xref: /openbmc/qemu/migration/rdma.c (revision 8f3f329f5e0117bd1a23a79ab751f8a7d3471e4b)
1329c9b10SDr. David Alan Gilbert /*
2329c9b10SDr. David Alan Gilbert  * RDMA protocol and interfaces
3329c9b10SDr. David Alan Gilbert  *
4329c9b10SDr. David Alan Gilbert  * Copyright IBM, Corp. 2010-2013
56ddd2d76SDaniel P. Berrange  * Copyright Red Hat, Inc. 2015-2016
6329c9b10SDr. David Alan Gilbert  *
7329c9b10SDr. David Alan Gilbert  * Authors:
8329c9b10SDr. David Alan Gilbert  *  Michael R. Hines <mrhines@us.ibm.com>
9329c9b10SDr. David Alan Gilbert  *  Jiuxing Liu <jl@us.ibm.com>
106ddd2d76SDaniel P. Berrange  *  Daniel P. Berrange <berrange@redhat.com>
11329c9b10SDr. David Alan Gilbert  *
12329c9b10SDr. David Alan Gilbert  * This work is licensed under the terms of the GNU GPL, version 2 or
13329c9b10SDr. David Alan Gilbert  * later.  See the COPYING file in the top-level directory.
14329c9b10SDr. David Alan Gilbert  *
15329c9b10SDr. David Alan Gilbert  */
160b8fa32fSMarkus Armbruster 
171393a485SPeter Maydell #include "qemu/osdep.h"
18da34e65cSMarkus Armbruster #include "qapi/error.h"
19f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
20c61d2faaSJuan Quintela #include "exec/target_page.h"
21e1a3eceeSJuan Quintela #include "rdma.h"
226666c96aSJuan Quintela #include "migration.h"
23c61d2faaSJuan Quintela #include "migration-stats.h"
2408a0aee1SJuan Quintela #include "qemu-file.h"
257b1e1a22SJuan Quintela #include "ram.h"
26d49b6836SMarkus Armbruster #include "qemu/error-report.h"
27329c9b10SDr. David Alan Gilbert #include "qemu/main-loop.h"
280b8fa32fSMarkus Armbruster #include "qemu/module.h"
29d4842052SMarkus Armbruster #include "qemu/rcu.h"
30329c9b10SDr. David Alan Gilbert #include "qemu/sockets.h"
31329c9b10SDr. David Alan Gilbert #include "qemu/bitmap.h"
3210817bf0SDaniel P. Berrange #include "qemu/coroutine.h"
335f1f1902SDavid Hildenbrand #include "exec/memory.h"
34329c9b10SDr. David Alan Gilbert #include <sys/socket.h>
35329c9b10SDr. David Alan Gilbert #include <netdb.h>
36329c9b10SDr. David Alan Gilbert #include <arpa/inet.h>
37329c9b10SDr. David Alan Gilbert #include <rdma/rdma_cma.h>
38733252deSDr. David Alan Gilbert #include "trace.h"
39db1015e9SEduardo Habkost #include "qom/object.h"
4017cba690SJuan Quintela #include "options.h"
41e49e49ddSLi Zhijian #include <poll.h>
42329c9b10SDr. David Alan Gilbert 
/*
 * Resolve timeout; the _MS suffix indicates milliseconds.
 * NOTE(review): presumably handed to the rdma_cm address/route resolution
 * calls -- confirm at the call sites.
 */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
/*
 * RDMA_MERGE_MAX counted in 4096-byte units; also the size of the
 * unregistrations[] array in RDMAContext.
 */
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 * (this is the size of the control[] buffer in RDMAWorkRequestData).
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 * Each capability is one bit in the 32-bit flags word of RDMACapabilities.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
76329c9b10SDr. David Alan Gilbert 
77329c9b10SDr. David Alan Gilbert /*
78329c9b10SDr. David Alan Gilbert  * A work request ID is 64-bits and we split up these bits
79329c9b10SDr. David Alan Gilbert  * into 3 parts:
80329c9b10SDr. David Alan Gilbert  *
81329c9b10SDr. David Alan Gilbert  * bits 0-15 : type of control message, 2^16
82329c9b10SDr. David Alan Gilbert  * bits 16-29: ram block index, 2^14
83329c9b10SDr. David Alan Gilbert  * bits 30-63: ram block chunk number, 2^34
84329c9b10SDr. David Alan Gilbert  *
85329c9b10SDr. David Alan Gilbert  * The last two bit ranges are only used for RDMA writes,
86329c9b10SDr. David Alan Gilbert  * in order to track their completion and potentially
87329c9b10SDr. David Alan Gilbert  * also track unregistration status of the message.
88329c9b10SDr. David Alan Gilbert  */
/*
 * Shift amounts and masks for the three fields packed into a 64-bit
 * work request ID: type (bits 0-15), ram block index (bits 16-29) and
 * ram block chunk number (bits 30-63).
 *
 * The masks are built from UINT64_C(1) rather than 1UL: 'unsigned long'
 * is only 32 bits wide on some hosts, where the complemented masks
 * would silently lose bits 32-63 of the chunk field.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((UINT64_C(1) << RDMA_WRID_BLOCK_SHIFT) - UINT64_C(1))

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & \
     ((UINT64_C(1) << RDMA_WRID_CHUNK_SHIFT) - UINT64_C(1)))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
100329c9b10SDr. David Alan Gilbert 
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 *
 * NOTE(review): these look like the values stored in the "type" field
 * of a work request ID -- confirm against the code that builds wrids.
 * Every value here is below 2^16.
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};
112329c9b10SDr. David Alan Gilbert 
/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 *
 * RDMA_WRID_MAX must stay last: it is the number of entries and sizes
 * the wr_data[] array in RDMAContext.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
126329c9b10SDr. David Alan Gilbert 
/*
 * SEND/RECV IB Control Messages.
 *
 * The numbering is spelled out so that inserting or reordering an entry
 * cannot silently renumber the others.
 */
enum {
    RDMA_CONTROL_NONE                = 0,
    RDMA_CONTROL_ERROR               = 1,
    RDMA_CONTROL_READY               = 2,  /* ready to receive */
    RDMA_CONTROL_QEMU_FILE           = 3,  /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST  = 4,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT   = 5,  /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS            = 6,  /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST    = 7,  /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT     = 8,  /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED   = 9,  /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST  = 10, /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED = 11, /* unpinning finished */
};
144329c9b10SDr. David Alan Gilbert 
145329c9b10SDr. David Alan Gilbert 
/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 *
 * The message buffer is embedded in the structure, so each instance is
 * slightly larger than RDMA_CONTROL_MAX_BUFFER bytes.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;
156329c9b10SDr. David Alan Gilbert 
/*
 * Negotiate RDMA capabilities during connection-setup time.
 *
 * Both fields are byte-swapped in place by caps_to_network() and
 * network_to_caps().
 */
typedef struct {
    uint32_t version;
    uint32_t flags;     /* RDMA_CAPABILITY_* bits */
} RDMACapabilities;
164329c9b10SDr. David Alan Gilbert 
/*
 * Convert a capabilities structure from host to network byte order,
 * in place.
 */
static void caps_to_network(RDMACapabilities *cap)
{
    const uint32_t host_version = cap->version;
    const uint32_t host_flags = cap->flags;

    cap->flags = htonl(host_flags);
    cap->version = htonl(host_version);
}
170329c9b10SDr. David Alan Gilbert 
/*
 * Convert a capabilities structure from network to host byte order,
 * in place.
 */
static void network_to_caps(RDMACapabilities *cap)
{
    const uint32_t wire_version = cap->version;
    const uint32_t wire_flags = cap->flags;

    cap->flags = ntohl(wire_flags);
    cap->version = ntohl(wire_version);
}
176329c9b10SDr. David Alan Gilbert 
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 *
 * Two sets of registration fields exist side by side: pmr/remote_keys
 * for chunk-level registration and mr/remote_rkey for non-chunk-level
 * registration.
 */
typedef struct RDMALocalBlock {
    char          *block_name;
    uint8_t       *local_host_addr; /* local virtual address */
    uint64_t       remote_host_addr; /* remote virtual address */
    uint64_t       offset;
    uint64_t       length;
    struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
    struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */
    int            index;           /* which block are we */
    unsigned int   src_index;       /* (Only used on dest) */
    bool           is_ram_block;
    int            nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
201329c9b10SDr. David Alan Gilbert 
/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 *
 * Packed because it is transmitted as-is; the field order and sizes
 * are therefore part of the wire format.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;   /* not byte-swapped by the conversion helpers */
} RDMADestBlock;
216329c9b10SDr. David Alan Gilbert 
control_desc(unsigned int rdma_control)217482a33c5SDr. David Alan Gilbert static const char *control_desc(unsigned int rdma_control)
218482a33c5SDr. David Alan Gilbert {
219482a33c5SDr. David Alan Gilbert     static const char *strs[] = {
220482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_NONE] = "NONE",
221482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_ERROR] = "ERROR",
222482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_READY] = "READY",
223482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
224482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
225482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
226482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_COMPRESS] = "COMPRESS",
227482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
228482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
229482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
230482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
231482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
232482a33c5SDr. David Alan Gilbert     };
233482a33c5SDr. David Alan Gilbert 
234482a33c5SDr. David Alan Gilbert     if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
235482a33c5SDr. David Alan Gilbert         return "??BAD CONTROL VALUE??";
236482a33c5SDr. David Alan Gilbert     }
237482a33c5SDr. David Alan Gilbert 
238482a33c5SDr. David Alan Gilbert     return strs[rdma_control];
239482a33c5SDr. David Alan Gilbert }
240482a33c5SDr. David Alan Gilbert 
#if !defined(htonll)
/*
 * Convert a 64-bit value from host to network byte order.
 *
 * The most significant 32 bits are converted with htonl() and placed
 * first in memory, followed by the converted least significant 32 bits.
 * Only compiled when the platform does not already provide an htonll
 * macro.
 */
static uint64_t htonll(uint64_t v)
{
    const union { uint32_t lv[2]; uint64_t llv; } halves = {
        .lv = { htonl(v >> 32), htonl(v & 0xFFFFFFFFULL) },
    };

    return halves.llv;
}
#endif
250329c9b10SDr. David Alan Gilbert 
#if !defined(ntohll)
/*
 * Convert a 64-bit value from network to host byte order.
 *
 * The first 32 bits in memory become the most significant half of the
 * result and the following 32 bits the least significant half, each
 * converted with ntohl().  Only compiled when the platform does not
 * already provide an ntohll macro.
 */
static uint64_t ntohll(uint64_t v)
{
    const union { uint64_t llv; uint32_t lv[2]; } wire = { .llv = v };
    const uint64_t high = ntohl(wire.lv[0]);
    const uint64_t low = ntohl(wire.lv[1]);

    return (high << 32) | low;
}
#endif
259329c9b10SDr. David Alan Gilbert 
/*
 * Convert a destination block description from host to network byte
 * order, in place.  The padding field is left as it is.
 */
static void dest_block_to_network(RDMADestBlock *db)
{
    const uint64_t host_addr = db->remote_host_addr;
    const uint64_t host_offset = db->offset;
    const uint64_t host_length = db->length;
    const uint32_t host_rkey = db->remote_rkey;

    db->remote_rkey = htonl(host_rkey);
    db->length = htonll(host_length);
    db->offset = htonll(host_offset);
    db->remote_host_addr = htonll(host_addr);
}
267329c9b10SDr. David Alan Gilbert 
/*
 * Convert a destination block description from network to host byte
 * order, in place.  The padding field is left as it is.
 */
static void network_to_dest_block(RDMADestBlock *db)
{
    const uint64_t wire_addr = db->remote_host_addr;
    const uint64_t wire_offset = db->offset;
    const uint64_t wire_length = db->length;
    const uint32_t wire_rkey = db->remote_rkey;

    db->remote_rkey = ntohl(wire_rkey);
    db->length = ntohll(wire_length);
    db->offset = ntohll(wire_offset);
    db->remote_host_addr = ntohll(wire_addr);
}
275329c9b10SDr. David Alan Gilbert 
/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;             /* NOTE(review): presumably the number of
                                  entries in block[] -- confirm */
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;     /* indexed by block index, see
                                  register_to_network() */
} RDMALocalBlocks;
286329c9b10SDr. David Alan Gilbert 
/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    /* One Send/Recv buffer per RDMA_WRID_* control work request ID. */
    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context          *verbs;
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
    struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *recv_cq;                 /* receive completion queue */
    struct ibv_cq *send_cq;                 /* send completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     *
     * error_reported is set by rdma_errored() once it has printed the
     * error-state message, so that the message is emitted only once.
     */
    bool errored;
    bool error_reported;
    bool received_error;

    /*
     * Description of ram blocks used throughout the code.
     * dest_blocks[] is indexed with the same block index as
     * local_ram_blocks.block[] (see register_to_network()).
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int    next_src_index;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;
379329c9b10SDr. David Alan Gilbert 
/* Type name string and type declaration for the RDMA-backed channel. */
#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)



/*
 * Instance structure for the type declared above.  It embeds a
 * QIOChannel as its first member and refers to two RDMAContext
 * objects, rdmain and rdmaout.
 */
struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};
392329c9b10SDr. David Alan Gilbert 
/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 *
 * Packed, four 32-bit fields.  len, type and repeat are byte-swapped by
 * control_to_network() / network_to_control(); padding is not.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;
403329c9b10SDr. David Alan Gilbert 
/*
 * Convert a control header from host to network byte order, in place.
 * The padding field is left as it is.
 */
static void control_to_network(RDMAControlHeader *control)
{
    const uint32_t host_len = control->len;
    const uint32_t host_type = control->type;
    const uint32_t host_repeat = control->repeat;

    control->len = htonl(host_len);
    control->type = htonl(host_type);
    control->repeat = htonl(host_repeat);
}
410329c9b10SDr. David Alan Gilbert 
/*
 * Convert a control header from network to host byte order, in place.
 * The padding field is left as it is.
 */
static void network_to_control(RDMAControlHeader *control)
{
    const uint32_t wire_len = control->len;
    const uint32_t wire_type = control->type;
    const uint32_t wire_repeat = control->repeat;

    control->len = ntohl(wire_len);
    control->type = ntohl(wire_type);
    control->repeat = ntohl(wire_repeat);
}
417329c9b10SDr. David Alan Gilbert 
/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 *
 * key is a union: current_addr and chunk occupy the same 8 bytes, so
 * the byte-order helpers convert both by converting current_addr.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;
433329c9b10SDr. David Alan Gilbert 
/*
 * Tell whether the RDMA context is in the error state.
 *
 * The first call that finds the context errored reports it with
 * error_report() and sets error_reported, so later calls stay silent.
 *
 * Returns the value of rdma->errored.
 */
static bool rdma_errored(RDMAContext *rdma)
{
    if (!rdma->errored) {
        return false;
    }

    if (!rdma->error_reported) {
        error_report("RDMA is in an error state waiting migration"
                     " to abort!");
        rdma->error_reported = true;
    }

    return true;
}
443de3e05e8SMarkus Armbruster 
/*
 * Prepare a registration request for transmission, in place.
 *
 * When the block selected by reg->current_index is a RAM block,
 * key.current_addr is first rebased from the local block's offset to
 * the offset of the matching entry in rdma->dest_blocks.  All fields
 * except padding are then converted to network byte order.
 *
 * reg->current_index is used as an array index without range checking.
 */
static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    const uint32_t block_index = reg->current_index;
    uint64_t addr = reg->key.current_addr;

    if (rdma->local_ram_blocks.block[block_index].is_ram_block) {
        /*
         * The address passed in lives in the local ram_addr_t space;
         * translate it for the destination.
         */
        addr -= rdma->local_ram_blocks.block[block_index].offset;
        addr += rdma->dest_blocks[block_index].offset;
    }

    reg->key.current_addr = htonll(addr);
    reg->current_index = htonl(block_index);
    reg->chunks = htonll(reg->chunks);
}
461329c9b10SDr. David Alan Gilbert 
/*
 * Convert a registration request from network to host byte order,
 * in place.  The padding field is left as it is.
 */
static void network_to_register(RDMARegister *reg)
{
    const uint64_t wire_key = reg->key.current_addr;
    const uint32_t wire_index = reg->current_index;
    const uint64_t wire_chunks = reg->chunks;

    reg->chunks = ntohll(wire_chunks);
    reg->current_index = ntohl(wire_index);
    reg->key.current_addr = ntohll(wire_key);
}
468329c9b10SDr. David Alan Gilbert 
/*
 * Packed description of a run of memory within one ram block.
 * NOTE(review): presumably the payload of an RDMA_CONTROL_COMPRESS
 * message ("page contains repeat values") -- confirm at the sender.
 */
typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;
475329c9b10SDr. David Alan Gilbert 
/*
 * Prepare a compress message for transmission, in place.
 *
 * comp->offset is rebased from the local block's offset to the offset
 * of the matching entry in rdma->dest_blocks, then every field is
 * converted to network byte order.
 *
 * comp->block_idx is used as an array index without range checking.
 */
static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    const uint32_t block_index = comp->block_idx;
    uint64_t dest_offset = comp->offset;

    /*
     * The offset passed in is an address in the local ram_addr_t space;
     * translate it for the destination.
     */
    dest_offset -= rdma->local_ram_blocks.block[block_index].offset;
    dest_offset += rdma->dest_blocks[block_index].offset;

    comp->value = htonl(comp->value);
    comp->block_idx = htonl(block_index);
    comp->offset = htonll(dest_offset);
    comp->length = htonll(comp->length);
}
489329c9b10SDr. David Alan Gilbert 
/*
 * Convert every field of an RDMACompress from network byte order to host
 * byte order, in place.  No offset translation is performed here.
 */
static void network_to_compress(RDMACompress *comp)
{
    comp->length = ntohll(comp->length);
    comp->offset = ntohll(comp->offset);
    comp->block_idx = ntohl(comp->block_idx);
    comp->value = ntohl(comp->value);
}
497329c9b10SDr. David Alan Gilbert 
/*
 * Outcome of a memory registration performed by the destination.
 *
 * Registering memory on the destination yields an "rkey"; the source VM
 * has to quote that rkey in order to perform the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
508329c9b10SDr. David Alan Gilbert 
/*
 * Convert the rkey and host_addr fields of an RDMARegisterResult from host
 * byte order to network byte order, in place.  The padding field is left
 * untouched.
 */
static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}
514329c9b10SDr. David Alan Gilbert 
/*
 * Convert the rkey and host_addr fields of an RDMARegisterResult from
 * network byte order to host byte order, in place.  The padding field is
 * left untouched.
 */
static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}
520329c9b10SDr. David Alan Gilbert 
521329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
522329c9b10SDr. David Alan Gilbert                                    uint8_t *data, RDMAControlHeader *resp,
523329c9b10SDr. David Alan Gilbert                                    int *resp_idx,
524de1aa35fSMarkus Armbruster                                    int (*callback)(RDMAContext *rdma,
525de1aa35fSMarkus Armbruster                                                    Error **errp),
526c4c78dceSMarkus Armbruster                                    Error **errp);
527329c9b10SDr. David Alan Gilbert 
ram_chunk_index(const uint8_t * start,const uint8_t * host)528329c9b10SDr. David Alan Gilbert static inline uint64_t ram_chunk_index(const uint8_t *start,
529329c9b10SDr. David Alan Gilbert                                        const uint8_t *host)
530329c9b10SDr. David Alan Gilbert {
531329c9b10SDr. David Alan Gilbert     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
532329c9b10SDr. David Alan Gilbert }
533329c9b10SDr. David Alan Gilbert 
ram_chunk_start(const RDMALocalBlock * rdma_ram_block,uint64_t i)534329c9b10SDr. David Alan Gilbert static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
535329c9b10SDr. David Alan Gilbert                                        uint64_t i)
536329c9b10SDr. David Alan Gilbert {
537fbce8c25SStefan Weil     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
538fbce8c25SStefan Weil                                   (i << RDMA_REG_CHUNK_SHIFT));
539329c9b10SDr. David Alan Gilbert }
540329c9b10SDr. David Alan Gilbert 
ram_chunk_end(const RDMALocalBlock * rdma_ram_block,uint64_t i)541329c9b10SDr. David Alan Gilbert static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
542329c9b10SDr. David Alan Gilbert                                      uint64_t i)
543329c9b10SDr. David Alan Gilbert {
544329c9b10SDr. David Alan Gilbert     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
545329c9b10SDr. David Alan Gilbert                                          (1UL << RDMA_REG_CHUNK_SHIFT);
546329c9b10SDr. David Alan Gilbert 
547329c9b10SDr. David Alan Gilbert     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
548329c9b10SDr. David Alan Gilbert         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
549329c9b10SDr. David Alan Gilbert     }
550329c9b10SDr. David Alan Gilbert 
551329c9b10SDr. David Alan Gilbert     return result;
552329c9b10SDr. David Alan Gilbert }
553329c9b10SDr. David Alan Gilbert 
/*
 * Append one entry to rdma->local_ram_blocks.
 *
 * The block array is replaced by a fresh allocation that is one element
 * larger, so every existing RDMALocalBlock moves.  When rdma->blockmap
 * exists, its entries are re-pointed at the relocated elements, and the new
 * block is inserted under @block_offset.
 *
 * @rdma:         context owning the block list
 * @block_name:   name of the block; a copy is stored
 * @host_addr:    local host address of the start of the block
 * @block_offset: offset of the block; also the blockmap key
 * @length:       length of the block in bytes
 */
static void rdma_add_block(RDMAContext *rdma, const char *block_name,
                           void *host_addr,
                           ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *prev = local->block;
    RDMALocalBlock *entry;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        if (rdma->blockmap) {
            /* Point each existing key at its element in the new array. */
            for (int i = 0; i < local->nb_blocks; i++) {
                void *key = (void *)(uintptr_t)prev[i].offset;

                g_hash_table_remove(rdma->blockmap, key);
                g_hash_table_insert(rdma->blockmap, key, &local->block[i]);
            }
        }
        memcpy(local->block, prev, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(prev);
    }

    /* The new entry occupies the last slot of the enlarged array. */
    entry = &local->block[local->nb_blocks];

    entry->block_name = g_strdup(block_name);
    entry->local_host_addr = host_addr;
    entry->offset = block_offset;
    entry->length = length;
    entry->index = local->nb_blocks;
    entry->src_index = ~0U; /* Filled in by the receipt of the block list */
    entry->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;

    entry->transit_bitmap = bitmap_new(entry->nb_chunks);
    bitmap_clear(entry->transit_bitmap, 0, entry->nb_chunks);
    entry->unregister_bitmap = bitmap_new(entry->nb_chunks);
    bitmap_clear(entry->unregister_bitmap, 0, entry->nb_chunks);
    entry->remote_keys = g_new0(uint32_t, entry->nb_chunks);

    /* Blocks added before local->init is set are flagged as RAM blocks. */
    entry->is_ram_block = !local->init;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap,
                            (void *)(uintptr_t)block_offset, entry);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) entry->local_host_addr,
                         entry->offset, entry->length,
                         (uintptr_t) (entry->local_host_addr + entry->length),
                         BITS_TO_LONGS(entry->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         entry->nb_chunks);

    local->nb_blocks++;
}
609329c9b10SDr. David Alan Gilbert 
610329c9b10SDr. David Alan Gilbert /*
611329c9b10SDr. David Alan Gilbert  * Memory regions need to be registered with the device and queue pairs setup
612329c9b10SDr. David Alan Gilbert  * in advanced before the migration starts. This tells us where the RAM blocks
613329c9b10SDr. David Alan Gilbert  * are so that we can register them individually.
614329c9b10SDr. David Alan Gilbert  */
qemu_rdma_init_one_block(RAMBlock * rb,void * opaque)615754cb9c0SYury Kotov static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
616329c9b10SDr. David Alan Gilbert {
617754cb9c0SYury Kotov     const char *block_name = qemu_ram_get_idstr(rb);
618754cb9c0SYury Kotov     void *host_addr = qemu_ram_get_host_addr(rb);
619754cb9c0SYury Kotov     ram_addr_t block_offset = qemu_ram_get_offset(rb);
620754cb9c0SYury Kotov     ram_addr_t length = qemu_ram_get_used_length(rb);
6210610d7a1SMarkus Armbruster     rdma_add_block(opaque, block_name, host_addr, block_offset, length);
6220610d7a1SMarkus Armbruster     return 0;
623329c9b10SDr. David Alan Gilbert }
624329c9b10SDr. David Alan Gilbert 
625329c9b10SDr. David Alan Gilbert /*
626329c9b10SDr. David Alan Gilbert  * Identify the RAMBlocks and their quantity. They will be references to
627329c9b10SDr. David Alan Gilbert  * identify chunk boundaries inside each RAMBlock and also be referenced
628329c9b10SDr. David Alan Gilbert  * during dynamic page registration.
629329c9b10SDr. David Alan Gilbert  */
qemu_rdma_init_ram_blocks(RDMAContext * rdma)6300610d7a1SMarkus Armbruster static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
631329c9b10SDr. David Alan Gilbert {
632329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
633281496bbSDr. David Alan Gilbert     int ret;
634329c9b10SDr. David Alan Gilbert 
635329c9b10SDr. David Alan Gilbert     assert(rdma->blockmap == NULL);
636329c9b10SDr. David Alan Gilbert     memset(local, 0, sizeof *local);
637281496bbSDr. David Alan Gilbert     ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
6380610d7a1SMarkus Armbruster     assert(!ret);
639733252deSDr. David Alan Gilbert     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
64097f3ad35SMarkus Armbruster     rdma->dest_blocks = g_new0(RDMADestBlock,
641329c9b10SDr. David Alan Gilbert                                rdma->local_ram_blocks.nb_blocks);
642329c9b10SDr. David Alan Gilbert     local->init = true;
643329c9b10SDr. David Alan Gilbert }
644329c9b10SDr. David Alan Gilbert 
64503fcab38SDr. David Alan Gilbert /*
64603fcab38SDr. David Alan Gilbert  * Note: If used outside of cleanup, the caller must ensure that the destination
64703fcab38SDr. David Alan Gilbert  * block structures are also updated
64803fcab38SDr. David Alan Gilbert  */
rdma_delete_block(RDMAContext * rdma,RDMALocalBlock * block)6491720a2a8SMarkus Armbruster static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
650329c9b10SDr. David Alan Gilbert {
651329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
652329c9b10SDr. David Alan Gilbert     RDMALocalBlock *old = local->block;
653329c9b10SDr. David Alan Gilbert 
65403fcab38SDr. David Alan Gilbert     if (rdma->blockmap) {
65503fcab38SDr. David Alan Gilbert         g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
65603fcab38SDr. David Alan Gilbert     }
657329c9b10SDr. David Alan Gilbert     if (block->pmr) {
65814e2fcbbSJuan Quintela         for (int j = 0; j < block->nb_chunks; j++) {
659329c9b10SDr. David Alan Gilbert             if (!block->pmr[j]) {
660329c9b10SDr. David Alan Gilbert                 continue;
661329c9b10SDr. David Alan Gilbert             }
662329c9b10SDr. David Alan Gilbert             ibv_dereg_mr(block->pmr[j]);
663329c9b10SDr. David Alan Gilbert             rdma->total_registrations--;
664329c9b10SDr. David Alan Gilbert         }
665329c9b10SDr. David Alan Gilbert         g_free(block->pmr);
666329c9b10SDr. David Alan Gilbert         block->pmr = NULL;
667329c9b10SDr. David Alan Gilbert     }
668329c9b10SDr. David Alan Gilbert 
669329c9b10SDr. David Alan Gilbert     if (block->mr) {
670329c9b10SDr. David Alan Gilbert         ibv_dereg_mr(block->mr);
671329c9b10SDr. David Alan Gilbert         rdma->total_registrations--;
672329c9b10SDr. David Alan Gilbert         block->mr = NULL;
673329c9b10SDr. David Alan Gilbert     }
674329c9b10SDr. David Alan Gilbert 
675329c9b10SDr. David Alan Gilbert     g_free(block->transit_bitmap);
676329c9b10SDr. David Alan Gilbert     block->transit_bitmap = NULL;
677329c9b10SDr. David Alan Gilbert 
678329c9b10SDr. David Alan Gilbert     g_free(block->unregister_bitmap);
679329c9b10SDr. David Alan Gilbert     block->unregister_bitmap = NULL;
680329c9b10SDr. David Alan Gilbert 
681329c9b10SDr. David Alan Gilbert     g_free(block->remote_keys);
682329c9b10SDr. David Alan Gilbert     block->remote_keys = NULL;
683329c9b10SDr. David Alan Gilbert 
6844fb5364bSDr. David Alan Gilbert     g_free(block->block_name);
6854fb5364bSDr. David Alan Gilbert     block->block_name = NULL;
6864fb5364bSDr. David Alan Gilbert 
68703fcab38SDr. David Alan Gilbert     if (rdma->blockmap) {
68814e2fcbbSJuan Quintela         for (int x = 0; x < local->nb_blocks; x++) {
68903fcab38SDr. David Alan Gilbert             g_hash_table_remove(rdma->blockmap,
69003fcab38SDr. David Alan Gilbert                                 (void *)(uintptr_t)old[x].offset);
69103fcab38SDr. David Alan Gilbert         }
692329c9b10SDr. David Alan Gilbert     }
693329c9b10SDr. David Alan Gilbert 
694329c9b10SDr. David Alan Gilbert     if (local->nb_blocks > 1) {
695329c9b10SDr. David Alan Gilbert 
69697f3ad35SMarkus Armbruster         local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
697329c9b10SDr. David Alan Gilbert 
698329c9b10SDr. David Alan Gilbert         if (block->index) {
699329c9b10SDr. David Alan Gilbert             memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
700329c9b10SDr. David Alan Gilbert         }
701329c9b10SDr. David Alan Gilbert 
702329c9b10SDr. David Alan Gilbert         if (block->index < (local->nb_blocks - 1)) {
703329c9b10SDr. David Alan Gilbert             memcpy(local->block + block->index, old + (block->index + 1),
704329c9b10SDr. David Alan Gilbert                 sizeof(RDMALocalBlock) *
705329c9b10SDr. David Alan Gilbert                     (local->nb_blocks - (block->index + 1)));
70614e2fcbbSJuan Quintela             for (int x = block->index; x < local->nb_blocks - 1; x++) {
70771cd7306SLidong Chen                 local->block[x].index--;
70871cd7306SLidong Chen             }
709329c9b10SDr. David Alan Gilbert         }
710329c9b10SDr. David Alan Gilbert     } else {
711329c9b10SDr. David Alan Gilbert         assert(block == local->block);
712329c9b10SDr. David Alan Gilbert         local->block = NULL;
713329c9b10SDr. David Alan Gilbert     }
714329c9b10SDr. David Alan Gilbert 
71503fcab38SDr. David Alan Gilbert     trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
716733252deSDr. David Alan Gilbert                            block->offset, block->length,
717fbce8c25SStefan Weil                             (uintptr_t)(block->local_host_addr + block->length),
718329c9b10SDr. David Alan Gilbert                            BITS_TO_LONGS(block->nb_chunks) *
719329c9b10SDr. David Alan Gilbert                                sizeof(unsigned long) * 8, block->nb_chunks);
720329c9b10SDr. David Alan Gilbert 
721329c9b10SDr. David Alan Gilbert     g_free(old);
722329c9b10SDr. David Alan Gilbert 
723329c9b10SDr. David Alan Gilbert     local->nb_blocks--;
724329c9b10SDr. David Alan Gilbert 
72503fcab38SDr. David Alan Gilbert     if (local->nb_blocks && rdma->blockmap) {
72614e2fcbbSJuan Quintela         for (int x = 0; x < local->nb_blocks; x++) {
727fbce8c25SStefan Weil             g_hash_table_insert(rdma->blockmap,
728fbce8c25SStefan Weil                                 (void *)(uintptr_t)local->block[x].offset,
729329c9b10SDr. David Alan Gilbert                                 &local->block[x]);
730329c9b10SDr. David Alan Gilbert         }
731329c9b10SDr. David Alan Gilbert     }
732329c9b10SDr. David Alan Gilbert }
733329c9b10SDr. David Alan Gilbert 
734329c9b10SDr. David Alan Gilbert /*
7352c88739cSMarkus Armbruster  * Trace RDMA device open, with device details.
736329c9b10SDr. David Alan Gilbert  */
qemu_rdma_dump_id(const char * who,struct ibv_context * verbs)737329c9b10SDr. David Alan Gilbert static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
738329c9b10SDr. David Alan Gilbert {
739329c9b10SDr. David Alan Gilbert     struct ibv_port_attr port;
740329c9b10SDr. David Alan Gilbert 
741329c9b10SDr. David Alan Gilbert     if (ibv_query_port(verbs, 1, &port)) {
7422c88739cSMarkus Armbruster         trace_qemu_rdma_dump_id_failed(who);
743329c9b10SDr. David Alan Gilbert         return;
744329c9b10SDr. David Alan Gilbert     }
745329c9b10SDr. David Alan Gilbert 
7462c88739cSMarkus Armbruster     trace_qemu_rdma_dump_id(who,
747329c9b10SDr. David Alan Gilbert                 verbs->device->name,
748329c9b10SDr. David Alan Gilbert                 verbs->device->dev_name,
749329c9b10SDr. David Alan Gilbert                 verbs->device->dev_path,
750329c9b10SDr. David Alan Gilbert                 verbs->device->ibdev_path,
751329c9b10SDr. David Alan Gilbert                 port.link_layer,
7522c88739cSMarkus Armbruster                 port.link_layer == IBV_LINK_LAYER_INFINIBAND ? "Infiniband"
7532c88739cSMarkus Armbruster                 : port.link_layer == IBV_LINK_LAYER_ETHERNET ? "Ethernet"
7542c88739cSMarkus Armbruster                 : "Unknown");
755329c9b10SDr. David Alan Gilbert }
756329c9b10SDr. David Alan Gilbert 
757329c9b10SDr. David Alan Gilbert /*
7582c88739cSMarkus Armbruster  * Trace RDMA gid addressing information.
7592c88739cSMarkus Armbruster  * Useful for understanding the RDMA device hierarchy in the kernel.
760329c9b10SDr. David Alan Gilbert  */
qemu_rdma_dump_gid(const char * who,struct rdma_cm_id * id)761329c9b10SDr. David Alan Gilbert static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
762329c9b10SDr. David Alan Gilbert {
763329c9b10SDr. David Alan Gilbert     char sgid[33];
764329c9b10SDr. David Alan Gilbert     char dgid[33];
765329c9b10SDr. David Alan Gilbert     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
766329c9b10SDr. David Alan Gilbert     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
767733252deSDr. David Alan Gilbert     trace_qemu_rdma_dump_gid(who, sgid, dgid);
768329c9b10SDr. David Alan Gilbert }
769329c9b10SDr. David Alan Gilbert 
770329c9b10SDr. David Alan Gilbert /*
771329c9b10SDr. David Alan Gilbert  * As of now, IPv6 over RoCE / iWARP is not supported by linux.
772329c9b10SDr. David Alan Gilbert  * We will try the next addrinfo struct, and fail if there are
773329c9b10SDr. David Alan Gilbert  * no other valid addresses to bind against.
774329c9b10SDr. David Alan Gilbert  *
775329c9b10SDr. David Alan Gilbert  * If user is listening on '[::]', then we will not have a opened a device
776329c9b10SDr. David Alan Gilbert  * yet and have no way of verifying if the device is RoCE or not.
777329c9b10SDr. David Alan Gilbert  *
778329c9b10SDr. David Alan Gilbert  * In this case, the source VM will throw an error for ALL types of
779329c9b10SDr. David Alan Gilbert  * connections (both IPv4 and IPv6) if the destination machine does not have
780329c9b10SDr. David Alan Gilbert  * a regular infiniband network available for use.
781329c9b10SDr. David Alan Gilbert  *
782329c9b10SDr. David Alan Gilbert  * The only way to guarantee that an error is thrown for broken kernels is
783329c9b10SDr. David Alan Gilbert  * for the management software to choose a *specific* interface at bind time
784329c9b10SDr. David Alan Gilbert  * and validate what time of hardware it is.
785329c9b10SDr. David Alan Gilbert  *
786329c9b10SDr. David Alan Gilbert  * Unfortunately, this puts the user in a fix:
787329c9b10SDr. David Alan Gilbert  *
788329c9b10SDr. David Alan Gilbert  *  If the source VM connects with an IPv4 address without knowing that the
789329c9b10SDr. David Alan Gilbert  *  destination has bound to '[::]' the migration will unconditionally fail
790b6af0975SDaniel P. Berrange  *  unless the management software is explicitly listening on the IPv4
791329c9b10SDr. David Alan Gilbert  *  address while using a RoCE-based device.
792329c9b10SDr. David Alan Gilbert  *
793329c9b10SDr. David Alan Gilbert  *  If the source VM connects with an IPv6 address, then we're OK because we can
794329c9b10SDr. David Alan Gilbert  *  throw an error on the source (and similarly on the destination).
795329c9b10SDr. David Alan Gilbert  *
796329c9b10SDr. David Alan Gilbert  *  But in mixed environments, this will be broken for a while until it is fixed
797329c9b10SDr. David Alan Gilbert  *  inside linux.
798329c9b10SDr. David Alan Gilbert  *
799329c9b10SDr. David Alan Gilbert  * We do provide a *tiny* bit of help in this function: We can list all of the
800329c9b10SDr. David Alan Gilbert  * devices in the system and check to see if all the devices are RoCE or
801329c9b10SDr. David Alan Gilbert  * Infiniband.
802329c9b10SDr. David Alan Gilbert  *
803329c9b10SDr. David Alan Gilbert  * If we detect that we have a *pure* RoCE environment, then we can safely
804329c9b10SDr. David Alan Gilbert  * thrown an error even if the management software has specified '[::]' as the
805329c9b10SDr. David Alan Gilbert  * bind address.
806329c9b10SDr. David Alan Gilbert  *
807329c9b10SDr. David Alan Gilbert  * However, if there is are multiple hetergeneous devices, then we cannot make
808329c9b10SDr. David Alan Gilbert  * this assumption and the user just has to be sure they know what they are
809329c9b10SDr. David Alan Gilbert  * doing.
810329c9b10SDr. David Alan Gilbert  *
811329c9b10SDr. David Alan Gilbert  * Patches are being reviewed on linux-rdma.
812329c9b10SDr. David Alan Gilbert  */
qemu_rdma_broken_ipv6_kernel(struct ibv_context * verbs,Error ** errp)813bbfb89e3SFam Zheng static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
814329c9b10SDr. David Alan Gilbert {
815329c9b10SDr. David Alan Gilbert     /* This bug only exists in linux, to our knowledge. */
816329c9b10SDr. David Alan Gilbert #ifdef CONFIG_LINUX
8171f4abd81SAlex Bennée     struct ibv_port_attr port_attr;
818329c9b10SDr. David Alan Gilbert 
819329c9b10SDr. David Alan Gilbert     /*
820329c9b10SDr. David Alan Gilbert      * Verbs are only NULL if management has bound to '[::]'.
821329c9b10SDr. David Alan Gilbert      *
822329c9b10SDr. David Alan Gilbert      * Let's iterate through all the devices and see if there any pure IB
823329c9b10SDr. David Alan Gilbert      * devices (non-ethernet).
824329c9b10SDr. David Alan Gilbert      *
825329c9b10SDr. David Alan Gilbert      * If not, then we can safely proceed with the migration.
826329c9b10SDr. David Alan Gilbert      * Otherwise, there are no guarantees until the bug is fixed in linux.
827329c9b10SDr. David Alan Gilbert      */
828329c9b10SDr. David Alan Gilbert     if (!verbs) {
82914e2fcbbSJuan Quintela         int num_devices;
830329c9b10SDr. David Alan Gilbert         struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
831329c9b10SDr. David Alan Gilbert         bool roce_found = false;
832329c9b10SDr. David Alan Gilbert         bool ib_found = false;
833329c9b10SDr. David Alan Gilbert 
83414e2fcbbSJuan Quintela         for (int x = 0; x < num_devices; x++) {
835329c9b10SDr. David Alan Gilbert             verbs = ibv_open_device(dev_list[x]);
8360bc26045SMarkus Armbruster             /*
8370bc26045SMarkus Armbruster              * ibv_open_device() is not documented to set errno.  If
8380bc26045SMarkus Armbruster              * it does, it's somebody else's doc bug.  If it doesn't,
8390bc26045SMarkus Armbruster              * the use of errno below is wrong.
8400bc26045SMarkus Armbruster              * TODO Find out whether ibv_open_device() sets errno.
8410bc26045SMarkus Armbruster              */
8425b61d575SPadmanabh Ratnakar             if (!verbs) {
8435b61d575SPadmanabh Ratnakar                 if (errno == EPERM) {
8445b61d575SPadmanabh Ratnakar                     continue;
8455b61d575SPadmanabh Ratnakar                 } else {
846142bd685SMarkus Armbruster                     error_setg_errno(errp, errno,
847142bd685SMarkus Armbruster                                      "could not open RDMA device context");
8488c6513f7SMarkus Armbruster                     return -1;
8495b61d575SPadmanabh Ratnakar                 }
8505b61d575SPadmanabh Ratnakar             }
851329c9b10SDr. David Alan Gilbert 
852329c9b10SDr. David Alan Gilbert             if (ibv_query_port(verbs, 1, &port_attr)) {
853329c9b10SDr. David Alan Gilbert                 ibv_close_device(verbs);
8548fd471bdSMarkus Armbruster                 error_setg(errp,
8558fd471bdSMarkus Armbruster                            "RDMA ERROR: Could not query initial IB port");
8568c6513f7SMarkus Armbruster                 return -1;
857329c9b10SDr. David Alan Gilbert             }
858329c9b10SDr. David Alan Gilbert 
859329c9b10SDr. David Alan Gilbert             if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
860329c9b10SDr. David Alan Gilbert                 ib_found = true;
861329c9b10SDr. David Alan Gilbert             } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
862329c9b10SDr. David Alan Gilbert                 roce_found = true;
863329c9b10SDr. David Alan Gilbert             }
864329c9b10SDr. David Alan Gilbert 
865329c9b10SDr. David Alan Gilbert             ibv_close_device(verbs);
866329c9b10SDr. David Alan Gilbert 
867329c9b10SDr. David Alan Gilbert         }
868329c9b10SDr. David Alan Gilbert 
869329c9b10SDr. David Alan Gilbert         if (roce_found) {
870329c9b10SDr. David Alan Gilbert             if (ib_found) {
871ff4c9194SMarkus Armbruster                 warn_report("migrations may fail:"
872329c9b10SDr. David Alan Gilbert                             " IPv6 over RoCE / iWARP in linux"
873329c9b10SDr. David Alan Gilbert                             " is broken. But since you appear to have a"
874329c9b10SDr. David Alan Gilbert                             " mixed RoCE / IB environment, be sure to only"
875329c9b10SDr. David Alan Gilbert                             " migrate over the IB fabric until the kernel "
876ff4c9194SMarkus Armbruster                             " fixes the bug.");
877329c9b10SDr. David Alan Gilbert             } else {
8788fd471bdSMarkus Armbruster                 error_setg(errp, "RDMA ERROR: "
8798fd471bdSMarkus Armbruster                            "You only have RoCE / iWARP devices in your systems"
880329c9b10SDr. David Alan Gilbert                            " and your management software has specified '[::]'"
881329c9b10SDr. David Alan Gilbert                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
8828c6513f7SMarkus Armbruster                 return -1;
883329c9b10SDr. David Alan Gilbert             }
884329c9b10SDr. David Alan Gilbert         }
885329c9b10SDr. David Alan Gilbert 
886329c9b10SDr. David Alan Gilbert         return 0;
887329c9b10SDr. David Alan Gilbert     }
888329c9b10SDr. David Alan Gilbert 
889329c9b10SDr. David Alan Gilbert     /*
890329c9b10SDr. David Alan Gilbert      * If we have a verbs context, that means that some other than '[::]' was
89102942db7SStefan Weil      * used by the management software for binding. In which case we can
89202942db7SStefan Weil      * actually warn the user about a potentially broken kernel.
893329c9b10SDr. David Alan Gilbert      */
894329c9b10SDr. David Alan Gilbert 
895329c9b10SDr. David Alan Gilbert     /* IB ports start with 1, not 0 */
896329c9b10SDr. David Alan Gilbert     if (ibv_query_port(verbs, 1, &port_attr)) {
8978fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: Could not query initial IB port");
8988c6513f7SMarkus Armbruster         return -1;
899329c9b10SDr. David Alan Gilbert     }
900329c9b10SDr. David Alan Gilbert 
901329c9b10SDr. David Alan Gilbert     if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
9028fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: "
9038fd471bdSMarkus Armbruster                    "Linux kernel's RoCE / iWARP does not support IPv6 "
904329c9b10SDr. David Alan Gilbert                    "(but patches on linux-rdma in progress)");
9058c6513f7SMarkus Armbruster         return -1;
906329c9b10SDr. David Alan Gilbert     }
907329c9b10SDr. David Alan Gilbert 
908329c9b10SDr. David Alan Gilbert #endif
909329c9b10SDr. David Alan Gilbert 
910329c9b10SDr. David Alan Gilbert     return 0;
911329c9b10SDr. David Alan Gilbert }
912329c9b10SDr. David Alan Gilbert 
913329c9b10SDr. David Alan Gilbert /*
914329c9b10SDr. David Alan Gilbert  * Figure out which RDMA device corresponds to the requested IP hostname
915329c9b10SDr. David Alan Gilbert  * Also create the initial connection manager identifiers for opening
916329c9b10SDr. David Alan Gilbert  * the connection.
917329c9b10SDr. David Alan Gilbert  */
qemu_rdma_resolve_host(RDMAContext * rdma,Error ** errp)918329c9b10SDr. David Alan Gilbert static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
919329c9b10SDr. David Alan Gilbert {
920071d5ae4SMarkus Armbruster     Error *err = NULL;
921329c9b10SDr. David Alan Gilbert     int ret;
922329c9b10SDr. David Alan Gilbert     struct rdma_addrinfo *res;
923329c9b10SDr. David Alan Gilbert     char port_str[16];
924329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
925329c9b10SDr. David Alan Gilbert     char ip[40] = "unknown";
926329c9b10SDr. David Alan Gilbert 
927329c9b10SDr. David Alan Gilbert     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
9288fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
92907249822SMarkus Armbruster         return -1;
930329c9b10SDr. David Alan Gilbert     }
931329c9b10SDr. David Alan Gilbert 
932329c9b10SDr. David Alan Gilbert     /* create CM channel */
933329c9b10SDr. David Alan Gilbert     rdma->channel = rdma_create_event_channel();
934329c9b10SDr. David Alan Gilbert     if (!rdma->channel) {
9358fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not create CM channel");
93607249822SMarkus Armbruster         return -1;
937329c9b10SDr. David Alan Gilbert     }
938329c9b10SDr. David Alan Gilbert 
939329c9b10SDr. David Alan Gilbert     /* create CM id */
940329c9b10SDr. David Alan Gilbert     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
9414a102179SMarkus Armbruster     if (ret < 0) {
9428fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not create channel id");
943329c9b10SDr. David Alan Gilbert         goto err_resolve_create_id;
944329c9b10SDr. David Alan Gilbert     }
945329c9b10SDr. David Alan Gilbert 
946329c9b10SDr. David Alan Gilbert     snprintf(port_str, 16, "%d", rdma->port);
947329c9b10SDr. David Alan Gilbert     port_str[15] = '\0';
948329c9b10SDr. David Alan Gilbert 
949329c9b10SDr. David Alan Gilbert     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
95007249822SMarkus Armbruster     if (ret) {
9518fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
9528fd471bdSMarkus Armbruster                    rdma->host);
953329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
954329c9b10SDr. David Alan Gilbert     }
955329c9b10SDr. David Alan Gilbert 
956071d5ae4SMarkus Armbruster     /* Try all addresses, saving the first error in @err */
95714e2fcbbSJuan Quintela     for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
958071d5ae4SMarkus Armbruster         Error **local_errp = err ? NULL : &err;
959071d5ae4SMarkus Armbruster 
960329c9b10SDr. David Alan Gilbert         inet_ntop(e->ai_family,
961329c9b10SDr. David Alan Gilbert             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
962733252deSDr. David Alan Gilbert         trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
963329c9b10SDr. David Alan Gilbert 
964329c9b10SDr. David Alan Gilbert         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
965329c9b10SDr. David Alan Gilbert                 RDMA_RESOLVE_TIMEOUT_MS);
9664a102179SMarkus Armbruster         if (ret >= 0) {
967329c9b10SDr. David Alan Gilbert             if (e->ai_family == AF_INET6) {
968071d5ae4SMarkus Armbruster                 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
969071d5ae4SMarkus Armbruster                                                    local_errp);
9704a102179SMarkus Armbruster                 if (ret < 0) {
971329c9b10SDr. David Alan Gilbert                     continue;
972329c9b10SDr. David Alan Gilbert                 }
973329c9b10SDr. David Alan Gilbert             }
974071d5ae4SMarkus Armbruster             error_free(err);
975329c9b10SDr. David Alan Gilbert             goto route;
976329c9b10SDr. David Alan Gilbert         }
977329c9b10SDr. David Alan Gilbert     }
978329c9b10SDr. David Alan Gilbert 
979f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
980071d5ae4SMarkus Armbruster     if (err) {
981071d5ae4SMarkus Armbruster         error_propagate(errp, err);
982071d5ae4SMarkus Armbruster     } else {
9838fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not resolve address %s",
9848fd471bdSMarkus Armbruster                    rdma->host);
9858fd471bdSMarkus Armbruster     }
986329c9b10SDr. David Alan Gilbert     goto err_resolve_get_addr;
987329c9b10SDr. David Alan Gilbert 
988329c9b10SDr. David Alan Gilbert route:
989f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
990329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
991329c9b10SDr. David Alan Gilbert 
992329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
9934a102179SMarkus Armbruster     if (ret < 0) {
9948fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
995329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
996329c9b10SDr. David Alan Gilbert     }
997329c9b10SDr. David Alan Gilbert 
998329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
9998fd471bdSMarkus Armbruster         error_setg(errp,
10008fd471bdSMarkus Armbruster                    "RDMA ERROR: result not equal to event_addr_resolved %s",
1001329c9b10SDr. David Alan Gilbert                    rdma_event_str(cm_event->event));
1002329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
1003329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1004329c9b10SDr. David Alan Gilbert     }
1005329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
1006329c9b10SDr. David Alan Gilbert 
1007329c9b10SDr. David Alan Gilbert     /* resolve route */
1008329c9b10SDr. David Alan Gilbert     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
10094a102179SMarkus Armbruster     if (ret < 0) {
10108fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not resolve rdma route");
1011329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1012329c9b10SDr. David Alan Gilbert     }
1013329c9b10SDr. David Alan Gilbert 
1014329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
10154a102179SMarkus Armbruster     if (ret < 0) {
10168fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
1017329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1018329c9b10SDr. David Alan Gilbert     }
1019329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
10208fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: "
10218fd471bdSMarkus Armbruster                    "result not equal to event_route_resolved: %s",
1022329c9b10SDr. David Alan Gilbert                    rdma_event_str(cm_event->event));
1023329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
1024329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1025329c9b10SDr. David Alan Gilbert     }
1026329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
1027329c9b10SDr. David Alan Gilbert     rdma->verbs = rdma->cm_id->verbs;
1028329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1029329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1030329c9b10SDr. David Alan Gilbert     return 0;
1031329c9b10SDr. David Alan Gilbert 
1032329c9b10SDr. David Alan Gilbert err_resolve_get_addr:
1033329c9b10SDr. David Alan Gilbert     rdma_destroy_id(rdma->cm_id);
1034329c9b10SDr. David Alan Gilbert     rdma->cm_id = NULL;
1035329c9b10SDr. David Alan Gilbert err_resolve_create_id:
1036329c9b10SDr. David Alan Gilbert     rdma_destroy_event_channel(rdma->channel);
1037329c9b10SDr. David Alan Gilbert     rdma->channel = NULL;
103807249822SMarkus Armbruster     return -1;
1039329c9b10SDr. David Alan Gilbert }
1040329c9b10SDr. David Alan Gilbert 
1041329c9b10SDr. David Alan Gilbert /*
1042329c9b10SDr. David Alan Gilbert  * Create protection domain and completion queues
1043329c9b10SDr. David Alan Gilbert  */
qemu_rdma_alloc_pd_cq(RDMAContext * rdma,Error ** errp)104407d5b946SMarkus Armbruster static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp)
1045329c9b10SDr. David Alan Gilbert {
1046329c9b10SDr. David Alan Gilbert     /* allocate pd */
1047329c9b10SDr. David Alan Gilbert     rdma->pd = ibv_alloc_pd(rdma->verbs);
1048329c9b10SDr. David Alan Gilbert     if (!rdma->pd) {
104907d5b946SMarkus Armbruster         error_setg(errp, "failed to allocate protection domain");
1050329c9b10SDr. David Alan Gilbert         return -1;
1051329c9b10SDr. David Alan Gilbert     }
1052329c9b10SDr. David Alan Gilbert 
1053b390afd8SLi Zhijian     /* create receive completion channel */
1054b390afd8SLi Zhijian     rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1055b390afd8SLi Zhijian     if (!rdma->recv_comp_channel) {
105607d5b946SMarkus Armbruster         error_setg(errp, "failed to allocate receive completion channel");
1057329c9b10SDr. David Alan Gilbert         goto err_alloc_pd_cq;
1058329c9b10SDr. David Alan Gilbert     }
1059329c9b10SDr. David Alan Gilbert 
1060329c9b10SDr. David Alan Gilbert     /*
1061b390afd8SLi Zhijian      * Completion queue can be filled by read work requests.
1062329c9b10SDr. David Alan Gilbert      */
1063b390afd8SLi Zhijian     rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1064b390afd8SLi Zhijian                                   NULL, rdma->recv_comp_channel, 0);
1065b390afd8SLi Zhijian     if (!rdma->recv_cq) {
106607d5b946SMarkus Armbruster         error_setg(errp, "failed to allocate receive completion queue");
1067b390afd8SLi Zhijian         goto err_alloc_pd_cq;
1068b390afd8SLi Zhijian     }
1069b390afd8SLi Zhijian 
1070b390afd8SLi Zhijian     /* create send completion channel */
1071b390afd8SLi Zhijian     rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1072b390afd8SLi Zhijian     if (!rdma->send_comp_channel) {
107307d5b946SMarkus Armbruster         error_setg(errp, "failed to allocate send completion channel");
1074b390afd8SLi Zhijian         goto err_alloc_pd_cq;
1075b390afd8SLi Zhijian     }
1076b390afd8SLi Zhijian 
1077b390afd8SLi Zhijian     rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1078b390afd8SLi Zhijian                                   NULL, rdma->send_comp_channel, 0);
1079b390afd8SLi Zhijian     if (!rdma->send_cq) {
108007d5b946SMarkus Armbruster         error_setg(errp, "failed to allocate send completion queue");
1081329c9b10SDr. David Alan Gilbert         goto err_alloc_pd_cq;
1082329c9b10SDr. David Alan Gilbert     }
1083329c9b10SDr. David Alan Gilbert 
1084329c9b10SDr. David Alan Gilbert     return 0;
1085329c9b10SDr. David Alan Gilbert 
1086329c9b10SDr. David Alan Gilbert err_alloc_pd_cq:
1087329c9b10SDr. David Alan Gilbert     if (rdma->pd) {
1088329c9b10SDr. David Alan Gilbert         ibv_dealloc_pd(rdma->pd);
1089329c9b10SDr. David Alan Gilbert     }
1090b390afd8SLi Zhijian     if (rdma->recv_comp_channel) {
1091b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->recv_comp_channel);
1092b390afd8SLi Zhijian     }
1093b390afd8SLi Zhijian     if (rdma->send_comp_channel) {
1094b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->send_comp_channel);
1095b390afd8SLi Zhijian     }
1096b390afd8SLi Zhijian     if (rdma->recv_cq) {
1097b390afd8SLi Zhijian         ibv_destroy_cq(rdma->recv_cq);
1098b390afd8SLi Zhijian         rdma->recv_cq = NULL;
1099329c9b10SDr. David Alan Gilbert     }
1100329c9b10SDr. David Alan Gilbert     rdma->pd = NULL;
1101b390afd8SLi Zhijian     rdma->recv_comp_channel = NULL;
1102b390afd8SLi Zhijian     rdma->send_comp_channel = NULL;
1103329c9b10SDr. David Alan Gilbert     return -1;
1104329c9b10SDr. David Alan Gilbert 
1105329c9b10SDr. David Alan Gilbert }
1106329c9b10SDr. David Alan Gilbert 
1107329c9b10SDr. David Alan Gilbert /*
1108329c9b10SDr. David Alan Gilbert  * Create queue pairs.
1109329c9b10SDr. David Alan Gilbert  */
qemu_rdma_alloc_qp(RDMAContext * rdma)1110329c9b10SDr. David Alan Gilbert static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1111329c9b10SDr. David Alan Gilbert {
1112329c9b10SDr. David Alan Gilbert     struct ibv_qp_init_attr attr = { 0 };
1113329c9b10SDr. David Alan Gilbert 
1114329c9b10SDr. David Alan Gilbert     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1115329c9b10SDr. David Alan Gilbert     attr.cap.max_recv_wr = 3;
1116329c9b10SDr. David Alan Gilbert     attr.cap.max_send_sge = 1;
1117329c9b10SDr. David Alan Gilbert     attr.cap.max_recv_sge = 1;
1118b390afd8SLi Zhijian     attr.send_cq = rdma->send_cq;
1119b390afd8SLi Zhijian     attr.recv_cq = rdma->recv_cq;
1120329c9b10SDr. David Alan Gilbert     attr.qp_type = IBV_QPT_RC;
1121329c9b10SDr. David Alan Gilbert 
11228f5a7faaSJuan Quintela     if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) {
1123329c9b10SDr. David Alan Gilbert         return -1;
1124329c9b10SDr. David Alan Gilbert     }
1125329c9b10SDr. David Alan Gilbert 
1126329c9b10SDr. David Alan Gilbert     rdma->qp = rdma->cm_id->qp;
1127329c9b10SDr. David Alan Gilbert     return 0;
1128329c9b10SDr. David Alan Gilbert }
1129329c9b10SDr. David Alan Gilbert 
1130e2daccb0SLi Zhijian /* Check whether On-Demand Paging is supported by RDAM device */
rdma_support_odp(struct ibv_context * dev)1131e2daccb0SLi Zhijian static bool rdma_support_odp(struct ibv_context *dev)
1132e2daccb0SLi Zhijian {
1133e2daccb0SLi Zhijian     struct ibv_device_attr_ex attr = {0};
11348f5a7faaSJuan Quintela 
11358f5a7faaSJuan Quintela     if (ibv_query_device_ex(dev, NULL, &attr)) {
1136e2daccb0SLi Zhijian         return false;
1137e2daccb0SLi Zhijian     }
1138e2daccb0SLi Zhijian 
1139e2daccb0SLi Zhijian     if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1140e2daccb0SLi Zhijian         return true;
1141e2daccb0SLi Zhijian     }
1142e2daccb0SLi Zhijian 
1143e2daccb0SLi Zhijian     return false;
1144e2daccb0SLi Zhijian }
1145e2daccb0SLi Zhijian 
/*
 * ibv_advise_mr to avoid RNR NAK error as far as possible.
 * A responder whose MR is registered with ODP will send an RNR NAK back
 * to the requester when it hits a page fault.
 *
 * @wr selects the write-prefetch advice instead of the plain prefetch
 * advice.  The result of ibv_advise_mr() is only traced, never acted on.
 * Without HAVE_IBV_ADVISE_MR this function does nothing.
 */
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len,  uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    struct ibv_sge sge = {.lkey = lkey, .addr = addr, .length = len};
    int advice;
    int rc;

    if (wr) {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE;
    } else {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH;
    }

    rc = ibv_advise_mr(pd, advice, IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
    /* ignore the error */
    trace_qemu_rdma_advise_mr(name, len, addr, strerror(rc));
#endif
}
1167911965acSLi Zhijian 
qemu_rdma_reg_whole_ram_blocks(RDMAContext * rdma,Error ** errp)1168de1aa35fSMarkus Armbruster static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp)
1169329c9b10SDr. David Alan Gilbert {
1170329c9b10SDr. David Alan Gilbert     int i;
1171329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
1172329c9b10SDr. David Alan Gilbert 
1173329c9b10SDr. David Alan Gilbert     for (i = 0; i < local->nb_blocks; i++) {
1174e2daccb0SLi Zhijian         int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1175e2daccb0SLi Zhijian 
1176329c9b10SDr. David Alan Gilbert         local->block[i].mr =
1177329c9b10SDr. David Alan Gilbert             ibv_reg_mr(rdma->pd,
1178329c9b10SDr. David Alan Gilbert                     local->block[i].local_host_addr,
1179e2daccb0SLi Zhijian                     local->block[i].length, access
1180329c9b10SDr. David Alan Gilbert                     );
11810bc26045SMarkus Armbruster         /*
11820bc26045SMarkus Armbruster          * ibv_reg_mr() is not documented to set errno.  If it does,
11830bc26045SMarkus Armbruster          * it's somebody else's doc bug.  If it doesn't, the use of
11840bc26045SMarkus Armbruster          * errno below is wrong.
11850bc26045SMarkus Armbruster          * TODO Find out whether ibv_reg_mr() sets errno.
11860bc26045SMarkus Armbruster          */
1187e2daccb0SLi Zhijian         if (!local->block[i].mr &&
1188e2daccb0SLi Zhijian             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1189e2daccb0SLi Zhijian                 access |= IBV_ACCESS_ON_DEMAND;
1190e2daccb0SLi Zhijian                 /* register ODP mr */
1191e2daccb0SLi Zhijian                 local->block[i].mr =
1192e2daccb0SLi Zhijian                     ibv_reg_mr(rdma->pd,
1193e2daccb0SLi Zhijian                                local->block[i].local_host_addr,
1194e2daccb0SLi Zhijian                                local->block[i].length, access);
1195e2daccb0SLi Zhijian                 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1196911965acSLi Zhijian 
1197911965acSLi Zhijian                 if (local->block[i].mr) {
1198911965acSLi Zhijian                     qemu_rdma_advise_prefetch_mr(rdma->pd,
1199911965acSLi Zhijian                                     (uintptr_t)local->block[i].local_host_addr,
1200911965acSLi Zhijian                                     local->block[i].length,
1201911965acSLi Zhijian                                     local->block[i].mr->lkey,
1202911965acSLi Zhijian                                     local->block[i].block_name,
1203911965acSLi Zhijian                                     true);
1204911965acSLi Zhijian                 }
1205e2daccb0SLi Zhijian         }
1206e2daccb0SLi Zhijian 
1207329c9b10SDr. David Alan Gilbert         if (!local->block[i].mr) {
1208de1aa35fSMarkus Armbruster             error_setg_errno(errp, errno,
1209de1aa35fSMarkus Armbruster                              "Failed to register local dest ram block!");
1210de1aa35fSMarkus Armbruster             goto err;
1211329c9b10SDr. David Alan Gilbert         }
1212329c9b10SDr. David Alan Gilbert         rdma->total_registrations++;
1213329c9b10SDr. David Alan Gilbert     }
1214329c9b10SDr. David Alan Gilbert 
1215329c9b10SDr. David Alan Gilbert     return 0;
1216329c9b10SDr. David Alan Gilbert 
1217de1aa35fSMarkus Armbruster err:
1218329c9b10SDr. David Alan Gilbert     for (i--; i >= 0; i--) {
1219329c9b10SDr. David Alan Gilbert         ibv_dereg_mr(local->block[i].mr);
1220224f364aSLi Zhijian         local->block[i].mr = NULL;
1221329c9b10SDr. David Alan Gilbert         rdma->total_registrations--;
1222329c9b10SDr. David Alan Gilbert     }
1223329c9b10SDr. David Alan Gilbert 
1224329c9b10SDr. David Alan Gilbert     return -1;
1225329c9b10SDr. David Alan Gilbert 
1226329c9b10SDr. David Alan Gilbert }
1227329c9b10SDr. David Alan Gilbert 
1228329c9b10SDr. David Alan Gilbert /*
1229329c9b10SDr. David Alan Gilbert  * Find the ram block that corresponds to the page requested to be
1230329c9b10SDr. David Alan Gilbert  * transmitted by QEMU.
1231329c9b10SDr. David Alan Gilbert  *
1232329c9b10SDr. David Alan Gilbert  * Once the block is found, also identify which 'chunk' within that
1233329c9b10SDr. David Alan Gilbert  * block that the page belongs to.
1234329c9b10SDr. David Alan Gilbert  */
qemu_rdma_search_ram_block(RDMAContext * rdma,uintptr_t block_offset,uint64_t offset,uint64_t length,uint64_t * block_index,uint64_t * chunk_index)123587e6bdabSMarkus Armbruster static void qemu_rdma_search_ram_block(RDMAContext *rdma,
1236fbce8c25SStefan Weil                                        uintptr_t block_offset,
1237329c9b10SDr. David Alan Gilbert                                        uint64_t offset,
1238329c9b10SDr. David Alan Gilbert                                        uint64_t length,
1239329c9b10SDr. David Alan Gilbert                                        uint64_t *block_index,
1240329c9b10SDr. David Alan Gilbert                                        uint64_t *chunk_index)
1241329c9b10SDr. David Alan Gilbert {
1242329c9b10SDr. David Alan Gilbert     uint64_t current_addr = block_offset + offset;
1243329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1244329c9b10SDr. David Alan Gilbert                                                 (void *) block_offset);
1245329c9b10SDr. David Alan Gilbert     assert(block);
1246329c9b10SDr. David Alan Gilbert     assert(current_addr >= block->offset);
1247329c9b10SDr. David Alan Gilbert     assert((current_addr + length) <= (block->offset + block->length));
1248329c9b10SDr. David Alan Gilbert 
1249329c9b10SDr. David Alan Gilbert     *block_index = block->index;
1250329c9b10SDr. David Alan Gilbert     *chunk_index = ram_chunk_index(block->local_host_addr,
1251329c9b10SDr. David Alan Gilbert                 block->local_host_addr + (current_addr - block->offset));
1252329c9b10SDr. David Alan Gilbert }
1253329c9b10SDr. David Alan Gilbert 
1254329c9b10SDr. David Alan Gilbert /*
1255329c9b10SDr. David Alan Gilbert  * Register a chunk with IB. If the chunk was already registered
1256329c9b10SDr. David Alan Gilbert  * previously, then skip.
1257329c9b10SDr. David Alan Gilbert  *
1258329c9b10SDr. David Alan Gilbert  * Also return the keys associated with the registration needed
1259329c9b10SDr. David Alan Gilbert  * to perform the actual RDMA operation.
1260329c9b10SDr. David Alan Gilbert  */
qemu_rdma_register_and_get_keys(RDMAContext * rdma,RDMALocalBlock * block,uintptr_t host_addr,uint32_t * lkey,uint32_t * rkey,int chunk,uint8_t * chunk_start,uint8_t * chunk_end)1261329c9b10SDr. David Alan Gilbert static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
12623ac040c0SStefan Weil         RDMALocalBlock *block, uintptr_t host_addr,
1263329c9b10SDr. David Alan Gilbert         uint32_t *lkey, uint32_t *rkey, int chunk,
1264329c9b10SDr. David Alan Gilbert         uint8_t *chunk_start, uint8_t *chunk_end)
1265329c9b10SDr. David Alan Gilbert {
1266329c9b10SDr. David Alan Gilbert     if (block->mr) {
1267329c9b10SDr. David Alan Gilbert         if (lkey) {
1268329c9b10SDr. David Alan Gilbert             *lkey = block->mr->lkey;
1269329c9b10SDr. David Alan Gilbert         }
1270329c9b10SDr. David Alan Gilbert         if (rkey) {
1271329c9b10SDr. David Alan Gilbert             *rkey = block->mr->rkey;
1272329c9b10SDr. David Alan Gilbert         }
1273329c9b10SDr. David Alan Gilbert         return 0;
1274329c9b10SDr. David Alan Gilbert     }
1275329c9b10SDr. David Alan Gilbert 
1276329c9b10SDr. David Alan Gilbert     /* allocate memory to store chunk MRs */
1277329c9b10SDr. David Alan Gilbert     if (!block->pmr) {
127897f3ad35SMarkus Armbruster         block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1279329c9b10SDr. David Alan Gilbert     }
1280329c9b10SDr. David Alan Gilbert 
1281329c9b10SDr. David Alan Gilbert     /*
1282329c9b10SDr. David Alan Gilbert      * If 'rkey', then we're the destination, so grant access to the source.
1283329c9b10SDr. David Alan Gilbert      *
1284329c9b10SDr. David Alan Gilbert      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1285329c9b10SDr. David Alan Gilbert      */
1286329c9b10SDr. David Alan Gilbert     if (!block->pmr[chunk]) {
1287329c9b10SDr. David Alan Gilbert         uint64_t len = chunk_end - chunk_start;
1288e2daccb0SLi Zhijian         int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1289e2daccb0SLi Zhijian                      0;
1290329c9b10SDr. David Alan Gilbert 
1291733252deSDr. David Alan Gilbert         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1292329c9b10SDr. David Alan Gilbert 
1293e2daccb0SLi Zhijian         block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
12940bc26045SMarkus Armbruster         /*
12950bc26045SMarkus Armbruster          * ibv_reg_mr() is not documented to set errno.  If it does,
12960bc26045SMarkus Armbruster          * it's somebody else's doc bug.  If it doesn't, the use of
12970bc26045SMarkus Armbruster          * errno below is wrong.
12980bc26045SMarkus Armbruster          * TODO Find out whether ibv_reg_mr() sets errno.
12990bc26045SMarkus Armbruster          */
1300e2daccb0SLi Zhijian         if (!block->pmr[chunk] &&
1301e2daccb0SLi Zhijian             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1302e2daccb0SLi Zhijian             access |= IBV_ACCESS_ON_DEMAND;
1303e2daccb0SLi Zhijian             /* register ODP mr */
1304e2daccb0SLi Zhijian             block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1305e2daccb0SLi Zhijian             trace_qemu_rdma_register_odp_mr(block->block_name);
1306911965acSLi Zhijian 
1307911965acSLi Zhijian             if (block->pmr[chunk]) {
1308911965acSLi Zhijian                 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1309911965acSLi Zhijian                                             len, block->pmr[chunk]->lkey,
1310911965acSLi Zhijian                                             block->block_name, rkey);
1311911965acSLi Zhijian 
1312911965acSLi Zhijian             }
1313e2daccb0SLi Zhijian         }
1314e2daccb0SLi Zhijian     }
1315329c9b10SDr. David Alan Gilbert     if (!block->pmr[chunk]) {
1316329c9b10SDr. David Alan Gilbert         return -1;
1317329c9b10SDr. David Alan Gilbert     }
1318329c9b10SDr. David Alan Gilbert     rdma->total_registrations++;
1319329c9b10SDr. David Alan Gilbert 
1320329c9b10SDr. David Alan Gilbert     if (lkey) {
1321329c9b10SDr. David Alan Gilbert         *lkey = block->pmr[chunk]->lkey;
1322329c9b10SDr. David Alan Gilbert     }
1323329c9b10SDr. David Alan Gilbert     if (rkey) {
1324329c9b10SDr. David Alan Gilbert         *rkey = block->pmr[chunk]->rkey;
1325329c9b10SDr. David Alan Gilbert     }
1326329c9b10SDr. David Alan Gilbert     return 0;
1327329c9b10SDr. David Alan Gilbert }
1328329c9b10SDr. David Alan Gilbert 
1329329c9b10SDr. David Alan Gilbert /*
1330329c9b10SDr. David Alan Gilbert  * Register (at connection time) the memory used for control
1331329c9b10SDr. David Alan Gilbert  * channel messages.
1332329c9b10SDr. David Alan Gilbert  */
qemu_rdma_reg_control(RDMAContext * rdma,int idx)1333329c9b10SDr. David Alan Gilbert static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1334329c9b10SDr. David Alan Gilbert {
1335329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1336329c9b10SDr. David Alan Gilbert             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1337329c9b10SDr. David Alan Gilbert             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1338329c9b10SDr. David Alan Gilbert     if (rdma->wr_data[idx].control_mr) {
1339329c9b10SDr. David Alan Gilbert         rdma->total_registrations++;
1340329c9b10SDr. David Alan Gilbert         return 0;
1341329c9b10SDr. David Alan Gilbert     }
1342329c9b10SDr. David Alan Gilbert     return -1;
1343329c9b10SDr. David Alan Gilbert }
1344329c9b10SDr. David Alan Gilbert 
1345329c9b10SDr. David Alan Gilbert /*
1346329c9b10SDr. David Alan Gilbert  * Perform a non-optimized memory unregistration after every transfer
134724ec68efSDr. David Alan Gilbert  * for demonstration purposes, only if pin-all is not requested.
1348329c9b10SDr. David Alan Gilbert  *
1349329c9b10SDr. David Alan Gilbert  * Potential optimizations:
1350329c9b10SDr. David Alan Gilbert  * 1. Start a new thread to run this function continuously
1351329c9b10SDr. David Alan Gilbert         - for bit clearing
1352329c9b10SDr. David Alan Gilbert         - and for receipt of unregister messages
1353329c9b10SDr. David Alan Gilbert  * 2. Use an LRU.
1354329c9b10SDr. David Alan Gilbert  * 3. Use workload hints.
1355329c9b10SDr. David Alan Gilbert  */
qemu_rdma_unregister_waiting(RDMAContext * rdma)1356329c9b10SDr. David Alan Gilbert static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1357329c9b10SDr. David Alan Gilbert {
1358c4c78dceSMarkus Armbruster     Error *err = NULL;
1359c4c78dceSMarkus Armbruster 
1360329c9b10SDr. David Alan Gilbert     while (rdma->unregistrations[rdma->unregister_current]) {
1361329c9b10SDr. David Alan Gilbert         int ret;
1362329c9b10SDr. David Alan Gilbert         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1363329c9b10SDr. David Alan Gilbert         uint64_t chunk =
1364329c9b10SDr. David Alan Gilbert             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1365329c9b10SDr. David Alan Gilbert         uint64_t index =
1366329c9b10SDr. David Alan Gilbert             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1367329c9b10SDr. David Alan Gilbert         RDMALocalBlock *block =
1368329c9b10SDr. David Alan Gilbert             &(rdma->local_ram_blocks.block[index]);
1369329c9b10SDr. David Alan Gilbert         RDMARegister reg = { .current_index = index };
1370329c9b10SDr. David Alan Gilbert         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1371329c9b10SDr. David Alan Gilbert                                  };
1372329c9b10SDr. David Alan Gilbert         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1373329c9b10SDr. David Alan Gilbert                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1374329c9b10SDr. David Alan Gilbert                                    .repeat = 1,
1375329c9b10SDr. David Alan Gilbert                                  };
1376329c9b10SDr. David Alan Gilbert 
1377733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_proc(chunk,
1378733252deSDr. David Alan Gilbert                                                 rdma->unregister_current);
1379329c9b10SDr. David Alan Gilbert 
1380329c9b10SDr. David Alan Gilbert         rdma->unregistrations[rdma->unregister_current] = 0;
1381329c9b10SDr. David Alan Gilbert         rdma->unregister_current++;
1382329c9b10SDr. David Alan Gilbert 
1383329c9b10SDr. David Alan Gilbert         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1384329c9b10SDr. David Alan Gilbert             rdma->unregister_current = 0;
1385329c9b10SDr. David Alan Gilbert         }
1386329c9b10SDr. David Alan Gilbert 
1387329c9b10SDr. David Alan Gilbert 
1388329c9b10SDr. David Alan Gilbert         /*
1389329c9b10SDr. David Alan Gilbert          * Unregistration is speculative (because migration is single-threaded
1390329c9b10SDr. David Alan Gilbert          * and we cannot break the protocol's inifinband message ordering).
1391329c9b10SDr. David Alan Gilbert          * Thus, if the memory is currently being used for transmission,
1392329c9b10SDr. David Alan Gilbert          * then abort the attempt to unregister and try again
1393329c9b10SDr. David Alan Gilbert          * later the next time a completion is received for this memory.
1394329c9b10SDr. David Alan Gilbert          */
1395329c9b10SDr. David Alan Gilbert         clear_bit(chunk, block->unregister_bitmap);
1396329c9b10SDr. David Alan Gilbert 
1397329c9b10SDr. David Alan Gilbert         if (test_bit(chunk, block->transit_bitmap)) {
1398733252deSDr. David Alan Gilbert             trace_qemu_rdma_unregister_waiting_inflight(chunk);
1399329c9b10SDr. David Alan Gilbert             continue;
1400329c9b10SDr. David Alan Gilbert         }
1401329c9b10SDr. David Alan Gilbert 
1402733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_send(chunk);
1403329c9b10SDr. David Alan Gilbert 
1404329c9b10SDr. David Alan Gilbert         ret = ibv_dereg_mr(block->pmr[chunk]);
1405329c9b10SDr. David Alan Gilbert         block->pmr[chunk] = NULL;
1406329c9b10SDr. David Alan Gilbert         block->remote_keys[chunk] = 0;
1407329c9b10SDr. David Alan Gilbert 
1408329c9b10SDr. David Alan Gilbert         if (ret != 0) {
1409ff4c9194SMarkus Armbruster             error_report("unregistration chunk failed: %s",
1410ff4c9194SMarkus Armbruster                          strerror(ret));
14118c6513f7SMarkus Armbruster             return -1;
1412329c9b10SDr. David Alan Gilbert         }
1413329c9b10SDr. David Alan Gilbert         rdma->total_registrations--;
1414329c9b10SDr. David Alan Gilbert 
1415329c9b10SDr. David Alan Gilbert         reg.key.chunk = chunk;
1416b12f7777SDr. David Alan Gilbert         register_to_network(rdma, &reg);
1417329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1418c4c78dceSMarkus Armbruster                                       &resp, NULL, NULL, &err);
1419329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1420c4c78dceSMarkus Armbruster             error_report_err(err);
1421ec486974SMarkus Armbruster             return -1;
1422329c9b10SDr. David Alan Gilbert         }
1423329c9b10SDr. David Alan Gilbert 
1424733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_complete(chunk);
1425329c9b10SDr. David Alan Gilbert     }
1426329c9b10SDr. David Alan Gilbert 
1427329c9b10SDr. David Alan Gilbert     return 0;
1428329c9b10SDr. David Alan Gilbert }
1429329c9b10SDr. David Alan Gilbert 
qemu_rdma_make_wrid(uint64_t wr_id,uint64_t index,uint64_t chunk)1430329c9b10SDr. David Alan Gilbert static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1431329c9b10SDr. David Alan Gilbert                                          uint64_t chunk)
1432329c9b10SDr. David Alan Gilbert {
1433329c9b10SDr. David Alan Gilbert     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1434329c9b10SDr. David Alan Gilbert 
1435329c9b10SDr. David Alan Gilbert     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1436329c9b10SDr. David Alan Gilbert     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1437329c9b10SDr. David Alan Gilbert 
1438329c9b10SDr. David Alan Gilbert     return result;
1439329c9b10SDr. David Alan Gilbert }
1440329c9b10SDr. David Alan Gilbert 
1441329c9b10SDr. David Alan Gilbert /*
1442329c9b10SDr. David Alan Gilbert  * Consult the connection manager to see a work request
1443329c9b10SDr. David Alan Gilbert  * (of any kind) has completed.
1444329c9b10SDr. David Alan Gilbert  * Return the work request ID that completed.
1445329c9b10SDr. David Alan Gilbert  */
qemu_rdma_poll(RDMAContext * rdma,struct ibv_cq * cq,uint64_t * wr_id_out,uint32_t * byte_len)1446b72eacf3SMarkus Armbruster static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1447b390afd8SLi Zhijian                           uint64_t *wr_id_out, uint32_t *byte_len)
1448329c9b10SDr. David Alan Gilbert {
1449329c9b10SDr. David Alan Gilbert     int ret;
1450329c9b10SDr. David Alan Gilbert     struct ibv_wc wc;
1451329c9b10SDr. David Alan Gilbert     uint64_t wr_id;
1452329c9b10SDr. David Alan Gilbert 
1453b390afd8SLi Zhijian     ret = ibv_poll_cq(cq, 1, &wc);
1454329c9b10SDr. David Alan Gilbert 
1455329c9b10SDr. David Alan Gilbert     if (!ret) {
1456329c9b10SDr. David Alan Gilbert         *wr_id_out = RDMA_WRID_NONE;
1457329c9b10SDr. David Alan Gilbert         return 0;
1458329c9b10SDr. David Alan Gilbert     }
1459329c9b10SDr. David Alan Gilbert 
1460329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1461ec486974SMarkus Armbruster         return -1;
1462329c9b10SDr. David Alan Gilbert     }
1463329c9b10SDr. David Alan Gilbert 
1464329c9b10SDr. David Alan Gilbert     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1465329c9b10SDr. David Alan Gilbert 
1466329c9b10SDr. David Alan Gilbert     if (wc.status != IBV_WC_SUCCESS) {
1467329c9b10SDr. David Alan Gilbert         return -1;
1468329c9b10SDr. David Alan Gilbert     }
1469329c9b10SDr. David Alan Gilbert 
1470329c9b10SDr. David Alan Gilbert     if (rdma->control_ready_expected &&
1471329c9b10SDr. David Alan Gilbert         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1472b5631d5bSMarkus Armbruster         trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
1473b5631d5bSMarkus Armbruster                                   rdma->nb_sent);
1474329c9b10SDr. David Alan Gilbert         rdma->control_ready_expected = 0;
1475329c9b10SDr. David Alan Gilbert     }
1476329c9b10SDr. David Alan Gilbert 
1477329c9b10SDr. David Alan Gilbert     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1478329c9b10SDr. David Alan Gilbert         uint64_t chunk =
1479329c9b10SDr. David Alan Gilbert             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1480329c9b10SDr. David Alan Gilbert         uint64_t index =
1481329c9b10SDr. David Alan Gilbert             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1482329c9b10SDr. David Alan Gilbert         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1483329c9b10SDr. David Alan Gilbert 
1484b5631d5bSMarkus Armbruster         trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
1485fbce8c25SStefan Weil                                    index, chunk, block->local_host_addr,
1486fbce8c25SStefan Weil                                    (void *)(uintptr_t)block->remote_host_addr);
1487329c9b10SDr. David Alan Gilbert 
1488329c9b10SDr. David Alan Gilbert         clear_bit(chunk, block->transit_bitmap);
1489329c9b10SDr. David Alan Gilbert 
1490329c9b10SDr. David Alan Gilbert         if (rdma->nb_sent > 0) {
1491329c9b10SDr. David Alan Gilbert             rdma->nb_sent--;
1492329c9b10SDr. David Alan Gilbert         }
1493329c9b10SDr. David Alan Gilbert     } else {
1494b5631d5bSMarkus Armbruster         trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
1495329c9b10SDr. David Alan Gilbert     }
1496329c9b10SDr. David Alan Gilbert 
1497329c9b10SDr. David Alan Gilbert     *wr_id_out = wc.wr_id;
1498329c9b10SDr. David Alan Gilbert     if (byte_len) {
1499329c9b10SDr. David Alan Gilbert         *byte_len = wc.byte_len;
1500329c9b10SDr. David Alan Gilbert     }
1501329c9b10SDr. David Alan Gilbert 
1502329c9b10SDr. David Alan Gilbert     return  0;
1503329c9b10SDr. David Alan Gilbert }
1504329c9b10SDr. David Alan Gilbert 
15059c98cfbeSDr. David Alan Gilbert /* Wait for activity on the completion channel.
15069c98cfbeSDr. David Alan Gilbert  * Returns 0 on success, none-0 on error.
15079c98cfbeSDr. David Alan Gilbert  */
qemu_rdma_wait_comp_channel(RDMAContext * rdma,struct ibv_comp_channel * comp_channel)1508b390afd8SLi Zhijian static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1509b390afd8SLi Zhijian                                        struct ibv_comp_channel *comp_channel)
15109c98cfbeSDr. David Alan Gilbert {
1511d5882995SLidong Chen     struct rdma_cm_event *cm_event;
1512d5882995SLidong Chen 
15139c98cfbeSDr. David Alan Gilbert     /*
15149c98cfbeSDr. David Alan Gilbert      * Coroutine doesn't start until migration_fd_process_incoming()
15159c98cfbeSDr. David Alan Gilbert      * so don't yield unless we know we're running inside of a coroutine.
15169c98cfbeSDr. David Alan Gilbert      */
1517f5627c2aSLidong Chen     if (rdma->migration_started_on_destination &&
1518f5627c2aSLidong Chen         migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1519b390afd8SLi Zhijian         yield_until_fd_readable(comp_channel->fd);
15209c98cfbeSDr. David Alan Gilbert     } else {
15219c98cfbeSDr. David Alan Gilbert         /* This is the source side, we're in a separate thread
15229c98cfbeSDr. David Alan Gilbert          * or destination prior to migration_fd_process_incoming()
15233a4452d8Szhaolichang          * after postcopy, the destination also in a separate thread.
15249c98cfbeSDr. David Alan Gilbert          * we can't yield; so we have to poll the fd.
15259c98cfbeSDr. David Alan Gilbert          * But we need to be able to handle 'cancel' or an error
15269c98cfbeSDr. David Alan Gilbert          * without hanging forever.
15279c98cfbeSDr. David Alan Gilbert          */
1528b86c94a4SMarkus Armbruster         while (!rdma->errored && !rdma->received_error) {
1529d5882995SLidong Chen             GPollFD pfds[2];
1530b390afd8SLi Zhijian             pfds[0].fd = comp_channel->fd;
15319c98cfbeSDr. David Alan Gilbert             pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1532d5882995SLidong Chen             pfds[0].revents = 0;
1533d5882995SLidong Chen 
1534d5882995SLidong Chen             pfds[1].fd = rdma->channel->fd;
1535d5882995SLidong Chen             pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1536d5882995SLidong Chen             pfds[1].revents = 0;
1537d5882995SLidong Chen 
15389c98cfbeSDr. David Alan Gilbert             /* 0.1s timeout, should be fine for a 'cancel' */
1539d5882995SLidong Chen             switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1540d5882995SLidong Chen             case 2:
15419c98cfbeSDr. David Alan Gilbert             case 1: /* fd active */
1542d5882995SLidong Chen                 if (pfds[0].revents) {
15439c98cfbeSDr. David Alan Gilbert                     return 0;
1544d5882995SLidong Chen                 }
1545d5882995SLidong Chen 
1546d5882995SLidong Chen                 if (pfds[1].revents) {
15478f5a7faaSJuan Quintela                     if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
15488c6513f7SMarkus Armbruster                         return -1;
1549d5882995SLidong Chen                     }
1550d5882995SLidong Chen 
1551d5882995SLidong Chen                     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1552d5882995SLidong Chen                         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
15536b8c2eb5SLi Zhijian                         rdma_ack_cm_event(cm_event);
15548c6513f7SMarkus Armbruster                         return -1;
1555d5882995SLidong Chen                     }
15566b8c2eb5SLi Zhijian                     rdma_ack_cm_event(cm_event);
1557d5882995SLidong Chen                 }
1558d5882995SLidong Chen                 break;
15599c98cfbeSDr. David Alan Gilbert 
15609c98cfbeSDr. David Alan Gilbert             case 0: /* Timeout, go around again */
15619c98cfbeSDr. David Alan Gilbert                 break;
15629c98cfbeSDr. David Alan Gilbert 
15639c98cfbeSDr. David Alan Gilbert             default: /* Error of some type -
15649c98cfbeSDr. David Alan Gilbert                       * I don't trust errno from qemu_poll_ns
15659c98cfbeSDr. David Alan Gilbert                      */
15668c6513f7SMarkus Armbruster                 return -1;
15679c98cfbeSDr. David Alan Gilbert             }
15689c98cfbeSDr. David Alan Gilbert 
15699c98cfbeSDr. David Alan Gilbert             if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
15709c98cfbeSDr. David Alan Gilbert                 /* Bail out and let the cancellation happen */
15718c6513f7SMarkus Armbruster                 return -1;
15729c98cfbeSDr. David Alan Gilbert             }
15739c98cfbeSDr. David Alan Gilbert         }
15749c98cfbeSDr. David Alan Gilbert     }
15759c98cfbeSDr. David Alan Gilbert 
15769c98cfbeSDr. David Alan Gilbert     if (rdma->received_error) {
15778c6513f7SMarkus Armbruster         return -1;
15789c98cfbeSDr. David Alan Gilbert     }
1579b86c94a4SMarkus Armbruster     return -rdma->errored;
15809c98cfbeSDr. David Alan Gilbert }
15819c98cfbeSDr. David Alan Gilbert 
/*
 * Pick the completion channel for @wrid: IDs below RDMA_WRID_RECV_CONTROL
 * use the send channel, all others use the receive channel.
 */
static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_comp_channel;
    }
    return rdma->recv_comp_channel;
}
1587b390afd8SLi Zhijian 
/*
 * Pick the completion queue for @wrid: IDs below RDMA_WRID_RECV_CONTROL
 * use the send queue, all others use the receive queue.
 */
static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_cq;
    }
    return rdma->recv_cq;
}
1592b390afd8SLi Zhijian 
1593329c9b10SDr. David Alan Gilbert /*
1594329c9b10SDr. David Alan Gilbert  * Block until the next work request has completed.
1595329c9b10SDr. David Alan Gilbert  *
1596329c9b10SDr. David Alan Gilbert  * First poll to see if a work request has already completed,
1597329c9b10SDr. David Alan Gilbert  * otherwise block.
1598329c9b10SDr. David Alan Gilbert  *
1599329c9b10SDr. David Alan Gilbert  * If we encounter completed work requests for IDs other than
1600329c9b10SDr. David Alan Gilbert  * the one we're interested in, then that's generally an error.
1601329c9b10SDr. David Alan Gilbert  *
1602329c9b10SDr. David Alan Gilbert  * The only exception is actual RDMA Write completions. These
1603329c9b10SDr. David Alan Gilbert  * completions only need to be recorded, but do not actually
1604329c9b10SDr. David Alan Gilbert  * need further processing.
1605329c9b10SDr. David Alan Gilbert  */
qemu_rdma_block_for_wrid(RDMAContext * rdma,uint64_t wrid_requested,uint32_t * byte_len)160687a24ca3SMarkus Armbruster static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
160787a24ca3SMarkus Armbruster                                     uint64_t wrid_requested,
1608329c9b10SDr. David Alan Gilbert                                     uint32_t *byte_len)
1609329c9b10SDr. David Alan Gilbert {
1610c0d77702SMarkus Armbruster     int num_cq_events = 0, ret;
1611329c9b10SDr. David Alan Gilbert     struct ibv_cq *cq;
1612329c9b10SDr. David Alan Gilbert     void *cq_ctx;
1613329c9b10SDr. David Alan Gilbert     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1614b390afd8SLi Zhijian     struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1615b390afd8SLi Zhijian     struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1616329c9b10SDr. David Alan Gilbert 
1617b390afd8SLi Zhijian     if (ibv_req_notify_cq(poll_cq, 0)) {
1618329c9b10SDr. David Alan Gilbert         return -1;
1619329c9b10SDr. David Alan Gilbert     }
1620329c9b10SDr. David Alan Gilbert     /* poll cq first */
1621329c9b10SDr. David Alan Gilbert     while (wr_id != wrid_requested) {
1622b390afd8SLi Zhijian         ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1623329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1624ec486974SMarkus Armbruster             return -1;
1625329c9b10SDr. David Alan Gilbert         }
1626329c9b10SDr. David Alan Gilbert 
1627329c9b10SDr. David Alan Gilbert         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1628329c9b10SDr. David Alan Gilbert 
1629329c9b10SDr. David Alan Gilbert         if (wr_id == RDMA_WRID_NONE) {
1630329c9b10SDr. David Alan Gilbert             break;
1631329c9b10SDr. David Alan Gilbert         }
1632329c9b10SDr. David Alan Gilbert         if (wr_id != wrid_requested) {
1633b5631d5bSMarkus Armbruster             trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1634329c9b10SDr. David Alan Gilbert         }
1635329c9b10SDr. David Alan Gilbert     }
1636329c9b10SDr. David Alan Gilbert 
1637329c9b10SDr. David Alan Gilbert     if (wr_id == wrid_requested) {
1638329c9b10SDr. David Alan Gilbert         return 0;
1639329c9b10SDr. David Alan Gilbert     }
1640329c9b10SDr. David Alan Gilbert 
1641329c9b10SDr. David Alan Gilbert     while (1) {
1642b390afd8SLi Zhijian         ret = qemu_rdma_wait_comp_channel(rdma, ch);
16434a102179SMarkus Armbruster         if (ret < 0) {
16449c98cfbeSDr. David Alan Gilbert             goto err_block_for_wrid;
1645329c9b10SDr. David Alan Gilbert         }
1646329c9b10SDr. David Alan Gilbert 
1647b390afd8SLi Zhijian         ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
16484a102179SMarkus Armbruster         if (ret < 0) {
1649329c9b10SDr. David Alan Gilbert             goto err_block_for_wrid;
1650329c9b10SDr. David Alan Gilbert         }
1651329c9b10SDr. David Alan Gilbert 
1652329c9b10SDr. David Alan Gilbert         num_cq_events++;
1653329c9b10SDr. David Alan Gilbert 
1654c0d77702SMarkus Armbruster         if (ibv_req_notify_cq(cq, 0)) {
1655329c9b10SDr. David Alan Gilbert             goto err_block_for_wrid;
1656329c9b10SDr. David Alan Gilbert         }
1657329c9b10SDr. David Alan Gilbert 
1658329c9b10SDr. David Alan Gilbert         while (wr_id != wrid_requested) {
1659b390afd8SLi Zhijian             ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1660329c9b10SDr. David Alan Gilbert             if (ret < 0) {
1661329c9b10SDr. David Alan Gilbert                 goto err_block_for_wrid;
1662329c9b10SDr. David Alan Gilbert             }
1663329c9b10SDr. David Alan Gilbert 
1664329c9b10SDr. David Alan Gilbert             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1665329c9b10SDr. David Alan Gilbert 
1666329c9b10SDr. David Alan Gilbert             if (wr_id == RDMA_WRID_NONE) {
1667329c9b10SDr. David Alan Gilbert                 break;
1668329c9b10SDr. David Alan Gilbert             }
1669329c9b10SDr. David Alan Gilbert             if (wr_id != wrid_requested) {
1670b5631d5bSMarkus Armbruster                 trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1671329c9b10SDr. David Alan Gilbert             }
1672329c9b10SDr. David Alan Gilbert         }
1673329c9b10SDr. David Alan Gilbert 
1674329c9b10SDr. David Alan Gilbert         if (wr_id == wrid_requested) {
1675329c9b10SDr. David Alan Gilbert             goto success_block_for_wrid;
1676329c9b10SDr. David Alan Gilbert         }
1677329c9b10SDr. David Alan Gilbert     }
1678329c9b10SDr. David Alan Gilbert 
1679329c9b10SDr. David Alan Gilbert success_block_for_wrid:
1680329c9b10SDr. David Alan Gilbert     if (num_cq_events) {
1681329c9b10SDr. David Alan Gilbert         ibv_ack_cq_events(cq, num_cq_events);
1682329c9b10SDr. David Alan Gilbert     }
1683329c9b10SDr. David Alan Gilbert     return 0;
1684329c9b10SDr. David Alan Gilbert 
1685329c9b10SDr. David Alan Gilbert err_block_for_wrid:
1686329c9b10SDr. David Alan Gilbert     if (num_cq_events) {
1687329c9b10SDr. David Alan Gilbert         ibv_ack_cq_events(cq, num_cq_events);
1688329c9b10SDr. David Alan Gilbert     }
16890b3c15f0SDr. David Alan Gilbert 
1690b86c94a4SMarkus Armbruster     rdma->errored = true;
1691ec486974SMarkus Armbruster     return -1;
1692329c9b10SDr. David Alan Gilbert }
1693329c9b10SDr. David Alan Gilbert 
1694329c9b10SDr. David Alan Gilbert /*
1695329c9b10SDr. David Alan Gilbert  * Post a SEND message work request for the control channel
1696329c9b10SDr. David Alan Gilbert  * containing some data and block until the post completes.
1697329c9b10SDr. David Alan Gilbert  */
qemu_rdma_post_send_control(RDMAContext * rdma,uint8_t * buf,RDMAControlHeader * head,Error ** errp)1698329c9b10SDr. David Alan Gilbert static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1699f3805964SMarkus Armbruster                                        RDMAControlHeader *head,
1700f3805964SMarkus Armbruster                                        Error **errp)
1701329c9b10SDr. David Alan Gilbert {
1702c0d77702SMarkus Armbruster     int ret;
1703329c9b10SDr. David Alan Gilbert     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1704329c9b10SDr. David Alan Gilbert     struct ibv_send_wr *bad_wr;
1705329c9b10SDr. David Alan Gilbert     struct ibv_sge sge = {
1706fbce8c25SStefan Weil                            .addr = (uintptr_t)(wr->control),
1707329c9b10SDr. David Alan Gilbert                            .length = head->len + sizeof(RDMAControlHeader),
1708329c9b10SDr. David Alan Gilbert                            .lkey = wr->control_mr->lkey,
1709329c9b10SDr. David Alan Gilbert                          };
1710329c9b10SDr. David Alan Gilbert     struct ibv_send_wr send_wr = {
1711329c9b10SDr. David Alan Gilbert                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1712329c9b10SDr. David Alan Gilbert                                    .opcode = IBV_WR_SEND,
1713329c9b10SDr. David Alan Gilbert                                    .send_flags = IBV_SEND_SIGNALED,
1714329c9b10SDr. David Alan Gilbert                                    .sg_list = &sge,
1715329c9b10SDr. David Alan Gilbert                                    .num_sge = 1,
1716329c9b10SDr. David Alan Gilbert                                 };
1717329c9b10SDr. David Alan Gilbert 
1718482a33c5SDr. David Alan Gilbert     trace_qemu_rdma_post_send_control(control_desc(head->type));
1719329c9b10SDr. David Alan Gilbert 
1720329c9b10SDr. David Alan Gilbert     /*
1721329c9b10SDr. David Alan Gilbert      * We don't actually need to do a memcpy() in here if we used
1722329c9b10SDr. David Alan Gilbert      * the "sge" properly, but since we're only sending control messages
1723329c9b10SDr. David Alan Gilbert      * (not RAM in a performance-critical path), then its OK for now.
1724329c9b10SDr. David Alan Gilbert      *
1725329c9b10SDr. David Alan Gilbert      * The copy makes the RDMAControlHeader simpler to manipulate
1726329c9b10SDr. David Alan Gilbert      * for the time being.
1727329c9b10SDr. David Alan Gilbert      */
1728329c9b10SDr. David Alan Gilbert     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1729329c9b10SDr. David Alan Gilbert     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1730329c9b10SDr. David Alan Gilbert     control_to_network((void *) wr->control);
1731329c9b10SDr. David Alan Gilbert 
1732329c9b10SDr. David Alan Gilbert     if (buf) {
1733329c9b10SDr. David Alan Gilbert         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1734329c9b10SDr. David Alan Gilbert     }
1735329c9b10SDr. David Alan Gilbert 
1736329c9b10SDr. David Alan Gilbert 
1737329c9b10SDr. David Alan Gilbert     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1738329c9b10SDr. David Alan Gilbert 
1739329c9b10SDr. David Alan Gilbert     if (ret > 0) {
1740f3805964SMarkus Armbruster         error_setg(errp, "Failed to use post IB SEND for control");
17418c6513f7SMarkus Armbruster         return -1;
1742329c9b10SDr. David Alan Gilbert     }
1743329c9b10SDr. David Alan Gilbert 
1744329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1745329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1746f3805964SMarkus Armbruster         error_setg(errp, "rdma migration: send polling control error");
1747ec486974SMarkus Armbruster         return -1;
1748329c9b10SDr. David Alan Gilbert     }
1749329c9b10SDr. David Alan Gilbert 
1750ec486974SMarkus Armbruster     return 0;
1751329c9b10SDr. David Alan Gilbert }
1752329c9b10SDr. David Alan Gilbert 
1753329c9b10SDr. David Alan Gilbert /*
1754329c9b10SDr. David Alan Gilbert  * Post a RECV work request in anticipation of some future receipt
1755329c9b10SDr. David Alan Gilbert  * of data on the control channel.
1756329c9b10SDr. David Alan Gilbert  */
qemu_rdma_post_recv_control(RDMAContext * rdma,int idx,Error ** errp)17573c0c3ebaSMarkus Armbruster static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx,
17583c0c3ebaSMarkus Armbruster                                        Error **errp)
1759329c9b10SDr. David Alan Gilbert {
1760329c9b10SDr. David Alan Gilbert     struct ibv_recv_wr *bad_wr;
1761329c9b10SDr. David Alan Gilbert     struct ibv_sge sge = {
1762fbce8c25SStefan Weil                             .addr = (uintptr_t)(rdma->wr_data[idx].control),
1763329c9b10SDr. David Alan Gilbert                             .length = RDMA_CONTROL_MAX_BUFFER,
1764329c9b10SDr. David Alan Gilbert                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1765329c9b10SDr. David Alan Gilbert                          };
1766329c9b10SDr. David Alan Gilbert 
1767329c9b10SDr. David Alan Gilbert     struct ibv_recv_wr recv_wr = {
1768329c9b10SDr. David Alan Gilbert                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1769329c9b10SDr. David Alan Gilbert                                     .sg_list = &sge,
1770329c9b10SDr. David Alan Gilbert                                     .num_sge = 1,
1771329c9b10SDr. David Alan Gilbert                                  };
1772329c9b10SDr. David Alan Gilbert 
1773329c9b10SDr. David Alan Gilbert 
1774329c9b10SDr. David Alan Gilbert     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
17753c0c3ebaSMarkus Armbruster         error_setg(errp, "error posting control recv");
1776329c9b10SDr. David Alan Gilbert         return -1;
1777329c9b10SDr. David Alan Gilbert     }
1778329c9b10SDr. David Alan Gilbert 
1779329c9b10SDr. David Alan Gilbert     return 0;
1780329c9b10SDr. David Alan Gilbert }
1781329c9b10SDr. David Alan Gilbert 
1782329c9b10SDr. David Alan Gilbert /*
1783329c9b10SDr. David Alan Gilbert  * Block and wait for a RECV control channel message to arrive.
1784329c9b10SDr. David Alan Gilbert  */
qemu_rdma_exchange_get_response(RDMAContext * rdma,RDMAControlHeader * head,uint32_t expecting,int idx,Error ** errp)1785329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
17863765ec1fSMarkus Armbruster                 RDMAControlHeader *head, uint32_t expecting, int idx,
17873765ec1fSMarkus Armbruster                 Error **errp)
1788329c9b10SDr. David Alan Gilbert {
1789329c9b10SDr. David Alan Gilbert     uint32_t byte_len;
1790329c9b10SDr. David Alan Gilbert     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1791329c9b10SDr. David Alan Gilbert                                        &byte_len);
1792329c9b10SDr. David Alan Gilbert 
1793329c9b10SDr. David Alan Gilbert     if (ret < 0) {
17943765ec1fSMarkus Armbruster         error_setg(errp, "rdma migration: recv polling control error!");
1795ec486974SMarkus Armbruster         return -1;
1796329c9b10SDr. David Alan Gilbert     }
1797329c9b10SDr. David Alan Gilbert 
1798329c9b10SDr. David Alan Gilbert     network_to_control((void *) rdma->wr_data[idx].control);
1799329c9b10SDr. David Alan Gilbert     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1800329c9b10SDr. David Alan Gilbert 
1801482a33c5SDr. David Alan Gilbert     trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1802329c9b10SDr. David Alan Gilbert 
1803329c9b10SDr. David Alan Gilbert     if (expecting == RDMA_CONTROL_NONE) {
1804482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1805733252deSDr. David Alan Gilbert                                              head->type);
1806329c9b10SDr. David Alan Gilbert     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
18073765ec1fSMarkus Armbruster         error_setg(errp, "Was expecting a %s (%d) control message"
1808733252deSDr. David Alan Gilbert                 ", but got: %s (%d), length: %d",
1809482a33c5SDr. David Alan Gilbert                 control_desc(expecting), expecting,
1810482a33c5SDr. David Alan Gilbert                 control_desc(head->type), head->type, head->len);
1811cd5ea070SDr. David Alan Gilbert         if (head->type == RDMA_CONTROL_ERROR) {
1812cd5ea070SDr. David Alan Gilbert             rdma->received_error = true;
1813cd5ea070SDr. David Alan Gilbert         }
18148c6513f7SMarkus Armbruster         return -1;
1815329c9b10SDr. David Alan Gilbert     }
1816329c9b10SDr. David Alan Gilbert     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
18173765ec1fSMarkus Armbruster         error_setg(errp, "too long length: %d", head->len);
18188c6513f7SMarkus Armbruster         return -1;
1819329c9b10SDr. David Alan Gilbert     }
1820329c9b10SDr. David Alan Gilbert     if (sizeof(*head) + head->len != byte_len) {
18213765ec1fSMarkus Armbruster         error_setg(errp, "Malformed length: %d byte_len %d",
18223765ec1fSMarkus Armbruster                    head->len, byte_len);
18238c6513f7SMarkus Armbruster         return -1;
1824329c9b10SDr. David Alan Gilbert     }
1825329c9b10SDr. David Alan Gilbert 
1826329c9b10SDr. David Alan Gilbert     return 0;
1827329c9b10SDr. David Alan Gilbert }
1828329c9b10SDr. David Alan Gilbert 
1829329c9b10SDr. David Alan Gilbert /*
1830329c9b10SDr. David Alan Gilbert  * When a RECV work request has completed, the work request's
1831329c9b10SDr. David Alan Gilbert  * buffer is pointed at the header.
1832329c9b10SDr. David Alan Gilbert  *
1833329c9b10SDr. David Alan Gilbert  * This will advance the pointer to the data portion
1834329c9b10SDr. David Alan Gilbert  * of the control message of the work request's buffer that
1835329c9b10SDr. David Alan Gilbert  * was populated after the work request finished.
1836329c9b10SDr. David Alan Gilbert  */
qemu_rdma_move_header(RDMAContext * rdma,int idx,RDMAControlHeader * head)1837329c9b10SDr. David Alan Gilbert static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1838329c9b10SDr. David Alan Gilbert                                   RDMAControlHeader *head)
1839329c9b10SDr. David Alan Gilbert {
1840329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_len = head->len;
1841329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_curr =
1842329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1843329c9b10SDr. David Alan Gilbert }
1844329c9b10SDr. David Alan Gilbert 
1845329c9b10SDr. David Alan Gilbert /*
/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * Additionally, if the user is expecting some kind of reply to this message,
 * they can request a 'resp' response message be filled in by posting an
 * additional work request on behalf of the user and waiting for an additional
 * completion.
 *
 * The extra (optional) response is used during registration to save us from
 * having to perform an *additional* exchange of messages just to provide a
 * response, by instead piggy-backing on the acknowledgement.
 */
qemu_rdma_exchange_send(RDMAContext * rdma,RDMAControlHeader * head,uint8_t * data,RDMAControlHeader * resp,int * resp_idx,int (* callback)(RDMAContext * rdma,Error ** errp),Error ** errp)1858329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1859329c9b10SDr. David Alan Gilbert                                    uint8_t *data, RDMAControlHeader *resp,
1860329c9b10SDr. David Alan Gilbert                                    int *resp_idx,
1861de1aa35fSMarkus Armbruster                                    int (*callback)(RDMAContext *rdma,
1862de1aa35fSMarkus Armbruster                                                    Error **errp),
1863c4c78dceSMarkus Armbruster                                    Error **errp)
1864329c9b10SDr. David Alan Gilbert {
1865c0d77702SMarkus Armbruster     int ret;
1866329c9b10SDr. David Alan Gilbert 
1867329c9b10SDr. David Alan Gilbert     /*
1868329c9b10SDr. David Alan Gilbert      * Wait until the dest is ready before attempting to deliver the message
1869329c9b10SDr. David Alan Gilbert      * by waiting for a READY message.
1870329c9b10SDr. David Alan Gilbert      */
1871329c9b10SDr. David Alan Gilbert     if (rdma->control_ready_expected) {
18727f3de3f0SMarkus Armbruster         RDMAControlHeader resp_ignored;
18737f3de3f0SMarkus Armbruster 
18747f3de3f0SMarkus Armbruster         ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
18757f3de3f0SMarkus Armbruster                                               RDMA_CONTROL_READY,
18763765ec1fSMarkus Armbruster                                               RDMA_WRID_READY, errp);
1877329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1878ec486974SMarkus Armbruster             return -1;
1879329c9b10SDr. David Alan Gilbert         }
1880329c9b10SDr. David Alan Gilbert     }
1881329c9b10SDr. David Alan Gilbert 
1882329c9b10SDr. David Alan Gilbert     /*
1883329c9b10SDr. David Alan Gilbert      * If the user is expecting a response, post a WR in anticipation of it.
1884329c9b10SDr. David Alan Gilbert      */
1885329c9b10SDr. David Alan Gilbert     if (resp) {
18863c0c3ebaSMarkus Armbruster         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp);
18874a102179SMarkus Armbruster         if (ret < 0) {
1888ec486974SMarkus Armbruster             return -1;
1889329c9b10SDr. David Alan Gilbert         }
1890329c9b10SDr. David Alan Gilbert     }
1891329c9b10SDr. David Alan Gilbert 
1892329c9b10SDr. David Alan Gilbert     /*
1893329c9b10SDr. David Alan Gilbert      * Post a WR to replace the one we just consumed for the READY message.
1894329c9b10SDr. David Alan Gilbert      */
18953c0c3ebaSMarkus Armbruster     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
18964a102179SMarkus Armbruster     if (ret < 0) {
1897ec486974SMarkus Armbruster         return -1;
1898329c9b10SDr. David Alan Gilbert     }
1899329c9b10SDr. David Alan Gilbert 
1900329c9b10SDr. David Alan Gilbert     /*
1901329c9b10SDr. David Alan Gilbert      * Deliver the control message that was requested.
1902329c9b10SDr. David Alan Gilbert      */
1903f3805964SMarkus Armbruster     ret = qemu_rdma_post_send_control(rdma, data, head, errp);
1904329c9b10SDr. David Alan Gilbert 
1905329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1906ec486974SMarkus Armbruster         return -1;
1907329c9b10SDr. David Alan Gilbert     }
1908329c9b10SDr. David Alan Gilbert 
1909329c9b10SDr. David Alan Gilbert     /*
1910329c9b10SDr. David Alan Gilbert      * If we're expecting a response, block and wait for it.
1911329c9b10SDr. David Alan Gilbert      */
1912329c9b10SDr. David Alan Gilbert     if (resp) {
1913329c9b10SDr. David Alan Gilbert         if (callback) {
1914733252deSDr. David Alan Gilbert             trace_qemu_rdma_exchange_send_issue_callback();
1915de1aa35fSMarkus Armbruster             ret = callback(rdma, errp);
1916329c9b10SDr. David Alan Gilbert             if (ret < 0) {
1917ec486974SMarkus Armbruster                 return -1;
1918329c9b10SDr. David Alan Gilbert             }
1919329c9b10SDr. David Alan Gilbert         }
1920329c9b10SDr. David Alan Gilbert 
1921482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1922329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_get_response(rdma, resp,
19233765ec1fSMarkus Armbruster                                               resp->type, RDMA_WRID_DATA,
19243765ec1fSMarkus Armbruster                                               errp);
1925329c9b10SDr. David Alan Gilbert 
1926329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1927ec486974SMarkus Armbruster             return -1;
1928329c9b10SDr. David Alan Gilbert         }
1929329c9b10SDr. David Alan Gilbert 
1930329c9b10SDr. David Alan Gilbert         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1931329c9b10SDr. David Alan Gilbert         if (resp_idx) {
1932329c9b10SDr. David Alan Gilbert             *resp_idx = RDMA_WRID_DATA;
1933329c9b10SDr. David Alan Gilbert         }
1934482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1935329c9b10SDr. David Alan Gilbert     }
1936329c9b10SDr. David Alan Gilbert 
1937329c9b10SDr. David Alan Gilbert     rdma->control_ready_expected = 1;
1938329c9b10SDr. David Alan Gilbert 
1939329c9b10SDr. David Alan Gilbert     return 0;
1940329c9b10SDr. David Alan Gilbert }
1941329c9b10SDr. David Alan Gilbert 
1942329c9b10SDr. David Alan Gilbert /*
1943329c9b10SDr. David Alan Gilbert  * This is an 'atomic' high-level operation to receive a single, unified
1944329c9b10SDr. David Alan Gilbert  * control-channel message.
1945329c9b10SDr. David Alan Gilbert  */
qemu_rdma_exchange_recv(RDMAContext * rdma,RDMAControlHeader * head,uint32_t expecting,Error ** errp)1946329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
194796f363d8SMarkus Armbruster                                    uint32_t expecting, Error **errp)
1948329c9b10SDr. David Alan Gilbert {
1949329c9b10SDr. David Alan Gilbert     RDMAControlHeader ready = {
1950329c9b10SDr. David Alan Gilbert                                 .len = 0,
1951329c9b10SDr. David Alan Gilbert                                 .type = RDMA_CONTROL_READY,
1952329c9b10SDr. David Alan Gilbert                                 .repeat = 1,
1953329c9b10SDr. David Alan Gilbert                               };
1954329c9b10SDr. David Alan Gilbert     int ret;
1955329c9b10SDr. David Alan Gilbert 
1956329c9b10SDr. David Alan Gilbert     /*
1957329c9b10SDr. David Alan Gilbert      * Inform the source that we're ready to receive a message.
1958329c9b10SDr. David Alan Gilbert      */
1959f3805964SMarkus Armbruster     ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp);
1960329c9b10SDr. David Alan Gilbert 
1961329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1962ec486974SMarkus Armbruster         return -1;
1963329c9b10SDr. David Alan Gilbert     }
1964329c9b10SDr. David Alan Gilbert 
1965329c9b10SDr. David Alan Gilbert     /*
1966329c9b10SDr. David Alan Gilbert      * Block and wait for the message.
1967329c9b10SDr. David Alan Gilbert      */
1968329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_exchange_get_response(rdma, head,
19693765ec1fSMarkus Armbruster                                           expecting, RDMA_WRID_READY, errp);
1970329c9b10SDr. David Alan Gilbert 
1971329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1972ec486974SMarkus Armbruster         return -1;
1973329c9b10SDr. David Alan Gilbert     }
1974329c9b10SDr. David Alan Gilbert 
1975329c9b10SDr. David Alan Gilbert     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1976329c9b10SDr. David Alan Gilbert 
1977329c9b10SDr. David Alan Gilbert     /*
1978329c9b10SDr. David Alan Gilbert      * Post a new RECV work request to replace the one we just consumed.
1979329c9b10SDr. David Alan Gilbert      */
19803c0c3ebaSMarkus Armbruster     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
19814a102179SMarkus Armbruster     if (ret < 0) {
1982ec486974SMarkus Armbruster         return -1;
1983329c9b10SDr. David Alan Gilbert     }
1984329c9b10SDr. David Alan Gilbert 
1985329c9b10SDr. David Alan Gilbert     return 0;
1986329c9b10SDr. David Alan Gilbert }
1987329c9b10SDr. David Alan Gilbert 
1988329c9b10SDr. David Alan Gilbert /*
1989329c9b10SDr. David Alan Gilbert  * Write an actual chunk of memory using RDMA.
1990329c9b10SDr. David Alan Gilbert  *
1991329c9b10SDr. David Alan Gilbert  * If we're using dynamic registration on the dest-side, we have to
1992329c9b10SDr. David Alan Gilbert  * send a registration command first.
1993329c9b10SDr. David Alan Gilbert  */
qemu_rdma_write_one(RDMAContext * rdma,int current_index,uint64_t current_addr,uint64_t length,Error ** errp)1994e3378035SJuan Quintela static int qemu_rdma_write_one(RDMAContext *rdma,
1995329c9b10SDr. David Alan Gilbert                                int current_index, uint64_t current_addr,
1996557c34caSMarkus Armbruster                                uint64_t length, Error **errp)
1997329c9b10SDr. David Alan Gilbert {
1998329c9b10SDr. David Alan Gilbert     struct ibv_sge sge;
1999329c9b10SDr. David Alan Gilbert     struct ibv_send_wr send_wr = { 0 };
2000329c9b10SDr. David Alan Gilbert     struct ibv_send_wr *bad_wr;
2001329c9b10SDr. David Alan Gilbert     int reg_result_idx, ret, count = 0;
2002329c9b10SDr. David Alan Gilbert     uint64_t chunk, chunks;
2003329c9b10SDr. David Alan Gilbert     uint8_t *chunk_start, *chunk_end;
2004329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2005329c9b10SDr. David Alan Gilbert     RDMARegister reg;
2006329c9b10SDr. David Alan Gilbert     RDMARegisterResult *reg_result;
2007329c9b10SDr. David Alan Gilbert     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2008329c9b10SDr. David Alan Gilbert     RDMAControlHeader head = { .len = sizeof(RDMARegister),
2009329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_REGISTER_REQUEST,
2010329c9b10SDr. David Alan Gilbert                                .repeat = 1,
2011329c9b10SDr. David Alan Gilbert                              };
2012329c9b10SDr. David Alan Gilbert 
2013329c9b10SDr. David Alan Gilbert retry:
2014fbce8c25SStefan Weil     sge.addr = (uintptr_t)(block->local_host_addr +
2015329c9b10SDr. David Alan Gilbert                             (current_addr - block->offset));
2016329c9b10SDr. David Alan Gilbert     sge.length = length;
2017329c9b10SDr. David Alan Gilbert 
2018fbce8c25SStefan Weil     chunk = ram_chunk_index(block->local_host_addr,
2019fbce8c25SStefan Weil                             (uint8_t *)(uintptr_t)sge.addr);
2020329c9b10SDr. David Alan Gilbert     chunk_start = ram_chunk_start(block, chunk);
2021329c9b10SDr. David Alan Gilbert 
2022329c9b10SDr. David Alan Gilbert     if (block->is_ram_block) {
2023329c9b10SDr. David Alan Gilbert         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2024329c9b10SDr. David Alan Gilbert 
2025329c9b10SDr. David Alan Gilbert         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2026329c9b10SDr. David Alan Gilbert             chunks--;
2027329c9b10SDr. David Alan Gilbert         }
2028329c9b10SDr. David Alan Gilbert     } else {
2029329c9b10SDr. David Alan Gilbert         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2030329c9b10SDr. David Alan Gilbert 
2031329c9b10SDr. David Alan Gilbert         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2032329c9b10SDr. David Alan Gilbert             chunks--;
2033329c9b10SDr. David Alan Gilbert         }
2034329c9b10SDr. David Alan Gilbert     }
2035329c9b10SDr. David Alan Gilbert 
2036733252deSDr. David Alan Gilbert     trace_qemu_rdma_write_one_top(chunks + 1,
2037733252deSDr. David Alan Gilbert                                   (chunks + 1) *
2038733252deSDr. David Alan Gilbert                                   (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2039329c9b10SDr. David Alan Gilbert 
2040329c9b10SDr. David Alan Gilbert     chunk_end = ram_chunk_end(block, chunk + chunks);
2041329c9b10SDr. David Alan Gilbert 
2042329c9b10SDr. David Alan Gilbert 
2043329c9b10SDr. David Alan Gilbert     while (test_bit(chunk, block->transit_bitmap)) {
2044329c9b10SDr. David Alan Gilbert         (void)count;
2045733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2046329c9b10SDr. David Alan Gilbert                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2047329c9b10SDr. David Alan Gilbert 
2048329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2049329c9b10SDr. David Alan Gilbert 
2050329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2051557c34caSMarkus Armbruster             error_setg(errp, "Failed to Wait for previous write to complete "
2052329c9b10SDr. David Alan Gilbert                     "block %d chunk %" PRIu64
2053733252deSDr. David Alan Gilbert                     " current %" PRIu64 " len %" PRIu64 " %d",
2054329c9b10SDr. David Alan Gilbert                     current_index, chunk, sge.addr, length, rdma->nb_sent);
2055ec486974SMarkus Armbruster             return -1;
2056329c9b10SDr. David Alan Gilbert         }
2057329c9b10SDr. David Alan Gilbert     }
2058329c9b10SDr. David Alan Gilbert 
2059329c9b10SDr. David Alan Gilbert     if (!rdma->pin_all || !block->is_ram_block) {
2060329c9b10SDr. David Alan Gilbert         if (!block->remote_keys[chunk]) {
2061329c9b10SDr. David Alan Gilbert             /*
2062329c9b10SDr. David Alan Gilbert              * This chunk has not yet been registered, so first check to see
2063329c9b10SDr. David Alan Gilbert              * if the entire chunk is zero. If so, tell the other size to
2064329c9b10SDr. David Alan Gilbert              * memset() + madvise() the entire chunk without RDMA.
2065329c9b10SDr. David Alan Gilbert              */
2066329c9b10SDr. David Alan Gilbert 
2067a1febc49SRichard Henderson             if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2068329c9b10SDr. David Alan Gilbert                 RDMACompress comp = {
2069329c9b10SDr. David Alan Gilbert                                         .offset = current_addr,
2070329c9b10SDr. David Alan Gilbert                                         .value = 0,
2071329c9b10SDr. David Alan Gilbert                                         .block_idx = current_index,
2072329c9b10SDr. David Alan Gilbert                                         .length = length,
2073329c9b10SDr. David Alan Gilbert                                     };
2074329c9b10SDr. David Alan Gilbert 
2075329c9b10SDr. David Alan Gilbert                 head.len = sizeof(comp);
2076329c9b10SDr. David Alan Gilbert                 head.type = RDMA_CONTROL_COMPRESS;
2077329c9b10SDr. David Alan Gilbert 
2078733252deSDr. David Alan Gilbert                 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2079733252deSDr. David Alan Gilbert                                                current_index, current_addr);
2080329c9b10SDr. David Alan Gilbert 
2081b12f7777SDr. David Alan Gilbert                 compress_to_network(rdma, &comp);
2082329c9b10SDr. David Alan Gilbert                 ret = qemu_rdma_exchange_send(rdma, &head,
2083557c34caSMarkus Armbruster                                 (uint8_t *) &comp, NULL, NULL, NULL, errp);
2084329c9b10SDr. David Alan Gilbert 
2085329c9b10SDr. David Alan Gilbert                 if (ret < 0) {
20868c6513f7SMarkus Armbruster                     return -1;
2087329c9b10SDr. David Alan Gilbert                 }
2088329c9b10SDr. David Alan Gilbert 
208967c31c9cSJuan Quintela                 /*
209067c31c9cSJuan Quintela                  * TODO: Here we are sending something, but we are not
209167c31c9cSJuan Quintela                  * accounting for anything transferred.  The following is wrong:
209267c31c9cSJuan Quintela                  *
209367c31c9cSJuan Quintela                  * stat64_add(&mig_stats.rdma_bytes, sge.length);
209467c31c9cSJuan Quintela                  *
209567c31c9cSJuan Quintela                  * because we are using some kind of compression.  I
209667c31c9cSJuan Quintela                  * would think that head.len would be the more similar
209767c31c9cSJuan Quintela                  * thing to a correct value.
209867c31c9cSJuan Quintela                  */
2099c61d2faaSJuan Quintela                 stat64_add(&mig_stats.zero_pages,
2100c61d2faaSJuan Quintela                            sge.length / qemu_target_page_size());
2101329c9b10SDr. David Alan Gilbert                 return 1;
2102329c9b10SDr. David Alan Gilbert             }
2103329c9b10SDr. David Alan Gilbert 
2104329c9b10SDr. David Alan Gilbert             /*
2105329c9b10SDr. David Alan Gilbert              * Otherwise, tell other side to register.
2106329c9b10SDr. David Alan Gilbert              */
2107329c9b10SDr. David Alan Gilbert             reg.current_index = current_index;
2108329c9b10SDr. David Alan Gilbert             if (block->is_ram_block) {
2109329c9b10SDr. David Alan Gilbert                 reg.key.current_addr = current_addr;
2110329c9b10SDr. David Alan Gilbert             } else {
2111329c9b10SDr. David Alan Gilbert                 reg.key.chunk = chunk;
2112329c9b10SDr. David Alan Gilbert             }
2113329c9b10SDr. David Alan Gilbert             reg.chunks = chunks;
2114329c9b10SDr. David Alan Gilbert 
2115733252deSDr. David Alan Gilbert             trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2116733252deSDr. David Alan Gilbert                                               current_addr);
2117329c9b10SDr. David Alan Gilbert 
2118b12f7777SDr. David Alan Gilbert             register_to_network(rdma, &reg);
2119329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
2120557c34caSMarkus Armbruster                                     &resp, &reg_result_idx, NULL, errp);
2121329c9b10SDr. David Alan Gilbert             if (ret < 0) {
2122ec486974SMarkus Armbruster                 return -1;
2123329c9b10SDr. David Alan Gilbert             }
2124329c9b10SDr. David Alan Gilbert 
2125329c9b10SDr. David Alan Gilbert             /* try to overlap this single registration with the one we sent. */
21263ac040c0SStefan Weil             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2127329c9b10SDr. David Alan Gilbert                                                 &sge.lkey, NULL, chunk,
2128329c9b10SDr. David Alan Gilbert                                                 chunk_start, chunk_end)) {
2129557c34caSMarkus Armbruster                 error_setg(errp, "cannot get lkey");
21308c6513f7SMarkus Armbruster                 return -1;
2131329c9b10SDr. David Alan Gilbert             }
2132329c9b10SDr. David Alan Gilbert 
2133329c9b10SDr. David Alan Gilbert             reg_result = (RDMARegisterResult *)
2134329c9b10SDr. David Alan Gilbert                     rdma->wr_data[reg_result_idx].control_curr;
2135329c9b10SDr. David Alan Gilbert 
2136329c9b10SDr. David Alan Gilbert             network_to_result(reg_result);
2137329c9b10SDr. David Alan Gilbert 
2138733252deSDr. David Alan Gilbert             trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2139733252deSDr. David Alan Gilbert                                                  reg_result->rkey, chunk);
2140329c9b10SDr. David Alan Gilbert 
2141329c9b10SDr. David Alan Gilbert             block->remote_keys[chunk] = reg_result->rkey;
2142329c9b10SDr. David Alan Gilbert             block->remote_host_addr = reg_result->host_addr;
2143329c9b10SDr. David Alan Gilbert         } else {
2144329c9b10SDr. David Alan Gilbert             /* already registered before */
21453ac040c0SStefan Weil             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2146329c9b10SDr. David Alan Gilbert                                                 &sge.lkey, NULL, chunk,
2147329c9b10SDr. David Alan Gilbert                                                 chunk_start, chunk_end)) {
2148557c34caSMarkus Armbruster                 error_setg(errp, "cannot get lkey!");
21498c6513f7SMarkus Armbruster                 return -1;
2150329c9b10SDr. David Alan Gilbert             }
2151329c9b10SDr. David Alan Gilbert         }
2152329c9b10SDr. David Alan Gilbert 
2153329c9b10SDr. David Alan Gilbert         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2154329c9b10SDr. David Alan Gilbert     } else {
2155329c9b10SDr. David Alan Gilbert         send_wr.wr.rdma.rkey = block->remote_rkey;
2156329c9b10SDr. David Alan Gilbert 
21573ac040c0SStefan Weil         if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2158329c9b10SDr. David Alan Gilbert                                                      &sge.lkey, NULL, chunk,
2159329c9b10SDr. David Alan Gilbert                                                      chunk_start, chunk_end)) {
2160557c34caSMarkus Armbruster             error_setg(errp, "cannot get lkey!");
21618c6513f7SMarkus Armbruster             return -1;
2162329c9b10SDr. David Alan Gilbert         }
2163329c9b10SDr. David Alan Gilbert     }
2164329c9b10SDr. David Alan Gilbert 
2165329c9b10SDr. David Alan Gilbert     /*
2166329c9b10SDr. David Alan Gilbert      * Encode the ram block index and chunk within this wrid.
2167329c9b10SDr. David Alan Gilbert      * We will use this information at the time of completion
2168329c9b10SDr. David Alan Gilbert      * to figure out which bitmap to check against and then which
2169329c9b10SDr. David Alan Gilbert      * chunk in the bitmap to look for.
2170329c9b10SDr. David Alan Gilbert      */
2171329c9b10SDr. David Alan Gilbert     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2172329c9b10SDr. David Alan Gilbert                                         current_index, chunk);
2173329c9b10SDr. David Alan Gilbert 
2174329c9b10SDr. David Alan Gilbert     send_wr.opcode = IBV_WR_RDMA_WRITE;
2175329c9b10SDr. David Alan Gilbert     send_wr.send_flags = IBV_SEND_SIGNALED;
2176329c9b10SDr. David Alan Gilbert     send_wr.sg_list = &sge;
2177329c9b10SDr. David Alan Gilbert     send_wr.num_sge = 1;
2178329c9b10SDr. David Alan Gilbert     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2179329c9b10SDr. David Alan Gilbert                                 (current_addr - block->offset);
2180329c9b10SDr. David Alan Gilbert 
2181733252deSDr. David Alan Gilbert     trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2182329c9b10SDr. David Alan Gilbert                                    sge.length);
2183329c9b10SDr. David Alan Gilbert 
2184329c9b10SDr. David Alan Gilbert     /*
2185329c9b10SDr. David Alan Gilbert      * ibv_post_send() does not return negative error numbers,
2186329c9b10SDr. David Alan Gilbert      * per the specification they are positive - no idea why.
2187329c9b10SDr. David Alan Gilbert      */
2188329c9b10SDr. David Alan Gilbert     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2189329c9b10SDr. David Alan Gilbert 
2190329c9b10SDr. David Alan Gilbert     if (ret == ENOMEM) {
2191733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_one_queue_full();
2192329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2193329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2194557c34caSMarkus Armbruster             error_setg(errp, "rdma migration: failed to make "
21951b6e1da6SMarkus Armbruster                          "room in full send queue!");
2196ec486974SMarkus Armbruster             return -1;
2197329c9b10SDr. David Alan Gilbert         }
2198329c9b10SDr. David Alan Gilbert 
2199329c9b10SDr. David Alan Gilbert         goto retry;
2200329c9b10SDr. David Alan Gilbert 
2201329c9b10SDr. David Alan Gilbert     } else if (ret > 0) {
2202557c34caSMarkus Armbruster         error_setg_errno(errp, ret,
2203557c34caSMarkus Armbruster                          "rdma migration: post rdma write failed");
22048c6513f7SMarkus Armbruster         return -1;
2205329c9b10SDr. David Alan Gilbert     }
2206329c9b10SDr. David Alan Gilbert 
2207329c9b10SDr. David Alan Gilbert     set_bit(chunk, block->transit_bitmap);
22085690756dSJuan Quintela     stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
220967c31c9cSJuan Quintela     /*
221067c31c9cSJuan Quintela      * We are adding to transferred the amount of data written, but no
2211e3fc6934SMichael Tokarev      * overhead at all.  I will assume that RDMA is magicaly and don't
221267c31c9cSJuan Quintela      * need to transfer (at least) the addresses where it wants to
221367c31c9cSJuan Quintela      * write the pages.  Here it looks like it should be something
221467c31c9cSJuan Quintela      * like:
221567c31c9cSJuan Quintela      *     sizeof(send_wr) + sge.length
221667c31c9cSJuan Quintela      * but this being RDMA, who knows.
221767c31c9cSJuan Quintela      */
221867c31c9cSJuan Quintela     stat64_add(&mig_stats.rdma_bytes, sge.length);
22195690756dSJuan Quintela     ram_transferred_add(sge.length);
2220329c9b10SDr. David Alan Gilbert     rdma->total_writes++;
2221329c9b10SDr. David Alan Gilbert 
2222329c9b10SDr. David Alan Gilbert     return 0;
2223329c9b10SDr. David Alan Gilbert }
2224329c9b10SDr. David Alan Gilbert 
2225329c9b10SDr. David Alan Gilbert /*
2226329c9b10SDr. David Alan Gilbert  * Push out any unwritten RDMA operations.
2227329c9b10SDr. David Alan Gilbert  *
2228329c9b10SDr. David Alan Gilbert  * We support sending out multiple chunks at the same time.
2229329c9b10SDr. David Alan Gilbert  * Not all of them need to get signaled in the completion queue.
2230329c9b10SDr. David Alan Gilbert  */
qemu_rdma_write_flush(RDMAContext * rdma,Error ** errp)223156095477SMarkus Armbruster static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp)
2232329c9b10SDr. David Alan Gilbert {
2233329c9b10SDr. David Alan Gilbert     int ret;
2234329c9b10SDr. David Alan Gilbert 
2235329c9b10SDr. David Alan Gilbert     if (!rdma->current_length) {
2236329c9b10SDr. David Alan Gilbert         return 0;
2237329c9b10SDr. David Alan Gilbert     }
2238329c9b10SDr. David Alan Gilbert 
2239557c34caSMarkus Armbruster     ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr,
2240557c34caSMarkus Armbruster                               rdma->current_length, errp);
2241329c9b10SDr. David Alan Gilbert 
2242329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2243ec486974SMarkus Armbruster         return -1;
2244329c9b10SDr. David Alan Gilbert     }
2245329c9b10SDr. David Alan Gilbert 
2246329c9b10SDr. David Alan Gilbert     if (ret == 0) {
2247329c9b10SDr. David Alan Gilbert         rdma->nb_sent++;
2248733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_flush(rdma->nb_sent);
2249329c9b10SDr. David Alan Gilbert     }
2250329c9b10SDr. David Alan Gilbert 
2251329c9b10SDr. David Alan Gilbert     rdma->current_length = 0;
2252329c9b10SDr. David Alan Gilbert     rdma->current_addr = 0;
2253329c9b10SDr. David Alan Gilbert 
2254329c9b10SDr. David Alan Gilbert     return 0;
2255329c9b10SDr. David Alan Gilbert }
2256329c9b10SDr. David Alan Gilbert 
/*
 * Decide whether the range [@offset, @offset + @len) can be appended to the
 * write that is currently being accumulated in @rdma.
 *
 * All of the following must hold:
 *   - a current block index and current chunk are set (both >= 0);
 *   - something is already pending (current_length != 0);
 *   - @offset starts exactly where the pending range ends;
 *   - the range lies inside the current block;
 *   - the range does not run past the end of the current chunk.
 *
 * Returns: true if the range may be merged, false otherwise.
 */
static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
                    uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return false;
    }

    if (rdma->current_chunk < 0) {
        return false;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    /* Evaluated ahead of the remaining guards, as before. */
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return false;
    }

    /*
     * Only merge into chunk sequentially.
     */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return false;
    }

    if (offset < block->offset) {
        return false;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return false;
    }

    /*
     * Form the host pointer only now that the range is known to lie inside
     * the block, so the pointer arithmetic cannot leave the block's mapping.
     */
    host_addr = block->local_host_addr + (offset - block->offset);
    if ((host_addr + len) > chunk_end) {
        return false;
    }

    return true;
}
2301329c9b10SDr. David Alan Gilbert 
2302329c9b10SDr. David Alan Gilbert /*
2303329c9b10SDr. David Alan Gilbert  * We're not actually writing here, but doing three things:
2304329c9b10SDr. David Alan Gilbert  *
2305329c9b10SDr. David Alan Gilbert  * 1. Identify the chunk the buffer belongs to.
2306329c9b10SDr. David Alan Gilbert  * 2. If the chunk is full or the buffer doesn't belong to the current
2307329c9b10SDr. David Alan Gilbert  *    chunk, then start a new chunk and flush() the old chunk.
2308329c9b10SDr. David Alan Gilbert  * 3. To keep the hardware busy, we also group chunks into batches
2309329c9b10SDr. David Alan Gilbert  *    and only require that a batch gets acknowledged in the completion
23103a4452d8Szhaolichang  *    queue instead of each individual chunk.
2311329c9b10SDr. David Alan Gilbert  */
qemu_rdma_write(RDMAContext * rdma,uint64_t block_offset,uint64_t offset,uint64_t len,Error ** errp)2312e3378035SJuan Quintela static int qemu_rdma_write(RDMAContext *rdma,
2313329c9b10SDr. David Alan Gilbert                            uint64_t block_offset, uint64_t offset,
2314446e559cSMarkus Armbruster                            uint64_t len, Error **errp)
2315329c9b10SDr. David Alan Gilbert {
2316329c9b10SDr. David Alan Gilbert     uint64_t current_addr = block_offset + offset;
2317329c9b10SDr. David Alan Gilbert     uint64_t index = rdma->current_index;
2318329c9b10SDr. David Alan Gilbert     uint64_t chunk = rdma->current_chunk;
2319329c9b10SDr. David Alan Gilbert 
2320329c9b10SDr. David Alan Gilbert     /* If we cannot merge it, we flush the current buffer first. */
23216a3792d7SMarkus Armbruster     if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
23228f5a7faaSJuan Quintela         if (qemu_rdma_write_flush(rdma, errp) < 0) {
2323ec486974SMarkus Armbruster             return -1;
2324329c9b10SDr. David Alan Gilbert         }
2325329c9b10SDr. David Alan Gilbert         rdma->current_length = 0;
2326329c9b10SDr. David Alan Gilbert         rdma->current_addr = current_addr;
2327329c9b10SDr. David Alan Gilbert 
232887e6bdabSMarkus Armbruster         qemu_rdma_search_ram_block(rdma, block_offset,
2329329c9b10SDr. David Alan Gilbert                                    offset, len, &index, &chunk);
2330329c9b10SDr. David Alan Gilbert         rdma->current_index = index;
2331329c9b10SDr. David Alan Gilbert         rdma->current_chunk = chunk;
2332329c9b10SDr. David Alan Gilbert     }
2333329c9b10SDr. David Alan Gilbert 
2334329c9b10SDr. David Alan Gilbert     /* merge it */
2335329c9b10SDr. David Alan Gilbert     rdma->current_length += len;
2336329c9b10SDr. David Alan Gilbert 
2337329c9b10SDr. David Alan Gilbert     /* flush it if buffer is too large */
2338329c9b10SDr. David Alan Gilbert     if (rdma->current_length >= RDMA_MERGE_MAX) {
2339446e559cSMarkus Armbruster         return qemu_rdma_write_flush(rdma, errp);
2340329c9b10SDr. David Alan Gilbert     }
2341329c9b10SDr. David Alan Gilbert 
2342329c9b10SDr. David Alan Gilbert     return 0;
2343329c9b10SDr. David Alan Gilbert }
2344329c9b10SDr. David Alan Gilbert 
qemu_rdma_cleanup(RDMAContext * rdma)2345329c9b10SDr. David Alan Gilbert static void qemu_rdma_cleanup(RDMAContext *rdma)
2346329c9b10SDr. David Alan Gilbert {
2347f3805964SMarkus Armbruster     Error *err = NULL;
2348329c9b10SDr. David Alan Gilbert 
2349329c9b10SDr. David Alan Gilbert     if (rdma->cm_id && rdma->connected) {
2350b86c94a4SMarkus Armbruster         if ((rdma->errored ||
235132bce196SDr. David Alan Gilbert              migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
235232bce196SDr. David Alan Gilbert             !rdma->received_error) {
2353329c9b10SDr. David Alan Gilbert             RDMAControlHeader head = { .len = 0,
2354329c9b10SDr. David Alan Gilbert                                        .type = RDMA_CONTROL_ERROR,
2355329c9b10SDr. David Alan Gilbert                                        .repeat = 1,
2356329c9b10SDr. David Alan Gilbert                                      };
23575cec563dSMarkus Armbruster             warn_report("Early error. Sending error.");
2358f3805964SMarkus Armbruster             if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) {
23595cec563dSMarkus Armbruster                 warn_report_err(err);
2360f3805964SMarkus Armbruster             }
2361329c9b10SDr. David Alan Gilbert         }
2362329c9b10SDr. David Alan Gilbert 
2363c5e76115SLidong Chen         rdma_disconnect(rdma->cm_id);
2364733252deSDr. David Alan Gilbert         trace_qemu_rdma_cleanup_disconnect();
2365329c9b10SDr. David Alan Gilbert         rdma->connected = false;
2366329c9b10SDr. David Alan Gilbert     }
2367329c9b10SDr. David Alan Gilbert 
2368cf75e268SDr. David Alan Gilbert     if (rdma->channel) {
2369fbbaacabSDr. David Alan Gilbert         qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2370cf75e268SDr. David Alan Gilbert     }
2371a97270adSDr. David Alan Gilbert     g_free(rdma->dest_blocks);
2372a97270adSDr. David Alan Gilbert     rdma->dest_blocks = NULL;
2373329c9b10SDr. David Alan Gilbert 
2374ebdb85f9SJuan Quintela     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2375ebdb85f9SJuan Quintela         if (rdma->wr_data[i].control_mr) {
2376329c9b10SDr. David Alan Gilbert             rdma->total_registrations--;
2377ebdb85f9SJuan Quintela             ibv_dereg_mr(rdma->wr_data[i].control_mr);
2378329c9b10SDr. David Alan Gilbert         }
2379ebdb85f9SJuan Quintela         rdma->wr_data[i].control_mr = NULL;
2380329c9b10SDr. David Alan Gilbert     }
2381329c9b10SDr. David Alan Gilbert 
2382329c9b10SDr. David Alan Gilbert     if (rdma->local_ram_blocks.block) {
2383329c9b10SDr. David Alan Gilbert         while (rdma->local_ram_blocks.nb_blocks) {
238403fcab38SDr. David Alan Gilbert             rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2385329c9b10SDr. David Alan Gilbert         }
2386329c9b10SDr. David Alan Gilbert     }
2387329c9b10SDr. David Alan Gilbert 
238880b262e1SPadmanabh Ratnakar     if (rdma->qp) {
238980b262e1SPadmanabh Ratnakar         rdma_destroy_qp(rdma->cm_id);
239080b262e1SPadmanabh Ratnakar         rdma->qp = NULL;
239180b262e1SPadmanabh Ratnakar     }
2392b390afd8SLi Zhijian     if (rdma->recv_cq) {
2393b390afd8SLi Zhijian         ibv_destroy_cq(rdma->recv_cq);
2394b390afd8SLi Zhijian         rdma->recv_cq = NULL;
2395329c9b10SDr. David Alan Gilbert     }
2396b390afd8SLi Zhijian     if (rdma->send_cq) {
2397b390afd8SLi Zhijian         ibv_destroy_cq(rdma->send_cq);
2398b390afd8SLi Zhijian         rdma->send_cq = NULL;
2399b390afd8SLi Zhijian     }
2400b390afd8SLi Zhijian     if (rdma->recv_comp_channel) {
2401b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->recv_comp_channel);
2402b390afd8SLi Zhijian         rdma->recv_comp_channel = NULL;
2403b390afd8SLi Zhijian     }
2404b390afd8SLi Zhijian     if (rdma->send_comp_channel) {
2405b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->send_comp_channel);
2406b390afd8SLi Zhijian         rdma->send_comp_channel = NULL;
2407329c9b10SDr. David Alan Gilbert     }
2408329c9b10SDr. David Alan Gilbert     if (rdma->pd) {
2409329c9b10SDr. David Alan Gilbert         ibv_dealloc_pd(rdma->pd);
2410329c9b10SDr. David Alan Gilbert         rdma->pd = NULL;
2411329c9b10SDr. David Alan Gilbert     }
241280b262e1SPadmanabh Ratnakar     if (rdma->cm_id) {
241380b262e1SPadmanabh Ratnakar         rdma_destroy_id(rdma->cm_id);
241480b262e1SPadmanabh Ratnakar         rdma->cm_id = NULL;
241580b262e1SPadmanabh Ratnakar     }
241655cc1b59SLidong Chen 
241755cc1b59SLidong Chen     /* the destination side, listen_id and channel is shared */
2418329c9b10SDr. David Alan Gilbert     if (rdma->listen_id) {
241955cc1b59SLidong Chen         if (!rdma->is_return_path) {
2420329c9b10SDr. David Alan Gilbert             rdma_destroy_id(rdma->listen_id);
2421329c9b10SDr. David Alan Gilbert         }
242255cc1b59SLidong Chen         rdma->listen_id = NULL;
242355cc1b59SLidong Chen 
242455cc1b59SLidong Chen         if (rdma->channel) {
242555cc1b59SLidong Chen             if (!rdma->is_return_path) {
242655cc1b59SLidong Chen                 rdma_destroy_event_channel(rdma->channel);
242755cc1b59SLidong Chen             }
242855cc1b59SLidong Chen             rdma->channel = NULL;
242955cc1b59SLidong Chen         }
243055cc1b59SLidong Chen     }
243155cc1b59SLidong Chen 
2432329c9b10SDr. David Alan Gilbert     if (rdma->channel) {
2433329c9b10SDr. David Alan Gilbert         rdma_destroy_event_channel(rdma->channel);
2434329c9b10SDr. David Alan Gilbert         rdma->channel = NULL;
2435329c9b10SDr. David Alan Gilbert     }
2436329c9b10SDr. David Alan Gilbert     g_free(rdma->host);
2437329c9b10SDr. David Alan Gilbert     rdma->host = NULL;
2438329c9b10SDr. David Alan Gilbert }
2439329c9b10SDr. David Alan Gilbert 
2440329c9b10SDr. David Alan Gilbert 
qemu_rdma_source_init(RDMAContext * rdma,bool pin_all,Error ** errp)2441bbfb89e3SFam Zheng static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2442329c9b10SDr. David Alan Gilbert {
2443ebdb85f9SJuan Quintela     int ret;
2444329c9b10SDr. David Alan Gilbert 
2445329c9b10SDr. David Alan Gilbert     /*
2446329c9b10SDr. David Alan Gilbert      * Will be validated against destination's actual capabilities
2447329c9b10SDr. David Alan Gilbert      * after the connect() completes.
2448329c9b10SDr. David Alan Gilbert      */
2449329c9b10SDr. David Alan Gilbert     rdma->pin_all = pin_all;
2450329c9b10SDr. David Alan Gilbert 
2451b16defbbSMarkus Armbruster     ret = qemu_rdma_resolve_host(rdma, errp);
24524a102179SMarkus Armbruster     if (ret < 0) {
2453329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2454329c9b10SDr. David Alan Gilbert     }
2455329c9b10SDr. David Alan Gilbert 
245607d5b946SMarkus Armbruster     ret = qemu_rdma_alloc_pd_cq(rdma, errp);
24574a102179SMarkus Armbruster     if (ret < 0) {
2458329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2459329c9b10SDr. David Alan Gilbert     }
2460329c9b10SDr. David Alan Gilbert 
2461329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_qp(rdma);
24624a102179SMarkus Armbruster     if (ret < 0) {
24638fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
2464329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2465329c9b10SDr. David Alan Gilbert     }
2466329c9b10SDr. David Alan Gilbert 
24670610d7a1SMarkus Armbruster     qemu_rdma_init_ram_blocks(rdma);
2468329c9b10SDr. David Alan Gilbert 
2469760ff4beSDr. David Alan Gilbert     /* Build the hash that maps from offset to RAMBlock */
2470760ff4beSDr. David Alan Gilbert     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2471ebdb85f9SJuan Quintela     for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
2472760ff4beSDr. David Alan Gilbert         g_hash_table_insert(rdma->blockmap,
2473ebdb85f9SJuan Quintela                 (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
2474ebdb85f9SJuan Quintela                 &rdma->local_ram_blocks.block[i]);
2475760ff4beSDr. David Alan Gilbert     }
2476760ff4beSDr. David Alan Gilbert 
2477ebdb85f9SJuan Quintela     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2478ebdb85f9SJuan Quintela         ret = qemu_rdma_reg_control(rdma, i);
24794a102179SMarkus Armbruster         if (ret < 0) {
2480ebdb85f9SJuan Quintela             error_setg(errp, "RDMA ERROR: rdma migration: error "
2481ebdb85f9SJuan Quintela                        "registering %d control!", i);
2482329c9b10SDr. David Alan Gilbert             goto err_rdma_source_init;
2483329c9b10SDr. David Alan Gilbert         }
2484329c9b10SDr. David Alan Gilbert     }
2485329c9b10SDr. David Alan Gilbert 
2486329c9b10SDr. David Alan Gilbert     return 0;
2487329c9b10SDr. David Alan Gilbert 
2488329c9b10SDr. David Alan Gilbert err_rdma_source_init:
2489329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
2490329c9b10SDr. David Alan Gilbert     return -1;
2491329c9b10SDr. David Alan Gilbert }
2492329c9b10SDr. David Alan Gilbert 
qemu_get_cm_event_timeout(RDMAContext * rdma,struct rdma_cm_event ** cm_event,long msec,Error ** errp)2493e49e49ddSLi Zhijian static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2494e49e49ddSLi Zhijian                                      struct rdma_cm_event **cm_event,
2495e49e49ddSLi Zhijian                                      long msec, Error **errp)
2496e49e49ddSLi Zhijian {
2497e49e49ddSLi Zhijian     int ret;
2498e49e49ddSLi Zhijian     struct pollfd poll_fd = {
2499e49e49ddSLi Zhijian                                 .fd = rdma->channel->fd,
2500e49e49ddSLi Zhijian                                 .events = POLLIN,
2501e49e49ddSLi Zhijian                                 .revents = 0
2502e49e49ddSLi Zhijian                             };
2503e49e49ddSLi Zhijian 
2504e49e49ddSLi Zhijian     do {
2505e49e49ddSLi Zhijian         ret = poll(&poll_fd, 1, msec);
2506e49e49ddSLi Zhijian     } while (ret < 0 && errno == EINTR);
2507e49e49ddSLi Zhijian 
2508e49e49ddSLi Zhijian     if (ret == 0) {
25098fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: poll cm event timeout");
2510e49e49ddSLi Zhijian         return -1;
2511e49e49ddSLi Zhijian     } else if (ret < 0) {
25128fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
25138fd471bdSMarkus Armbruster                    errno);
2514e49e49ddSLi Zhijian         return -1;
2515e49e49ddSLi Zhijian     } else if (poll_fd.revents & POLLIN) {
2516f35c0d9bSMarkus Armbruster         if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
25178fd471bdSMarkus Armbruster             error_setg(errp, "RDMA ERROR: failed to get cm event");
2518f35c0d9bSMarkus Armbruster             return -1;
2519f35c0d9bSMarkus Armbruster         }
2520f35c0d9bSMarkus Armbruster         return 0;
2521e49e49ddSLi Zhijian     } else {
25228fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
25238fd471bdSMarkus Armbruster                    poll_fd.revents);
2524e49e49ddSLi Zhijian         return -1;
2525e49e49ddSLi Zhijian     }
2526e49e49ddSLi Zhijian }
2527e49e49ddSLi Zhijian 
qemu_rdma_connect(RDMAContext * rdma,bool return_path,Error ** errp)25283c03f21cSMarkus Armbruster static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
25293c03f21cSMarkus Armbruster                              Error **errp)
2530329c9b10SDr. David Alan Gilbert {
2531329c9b10SDr. David Alan Gilbert     RDMACapabilities cap = {
2532329c9b10SDr. David Alan Gilbert                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2533329c9b10SDr. David Alan Gilbert                                 .flags = 0,
2534329c9b10SDr. David Alan Gilbert                            };
2535329c9b10SDr. David Alan Gilbert     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2536329c9b10SDr. David Alan Gilbert                                           .retry_count = 5,
2537329c9b10SDr. David Alan Gilbert                                           .private_data = &cap,
2538329c9b10SDr. David Alan Gilbert                                           .private_data_len = sizeof(cap),
2539329c9b10SDr. David Alan Gilbert                                         };
2540329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
2541329c9b10SDr. David Alan Gilbert     int ret;
2542329c9b10SDr. David Alan Gilbert 
2543329c9b10SDr. David Alan Gilbert     /*
2544329c9b10SDr. David Alan Gilbert      * Only negotiate the capability with destination if the user
2545329c9b10SDr. David Alan Gilbert      * on the source first requested the capability.
2546329c9b10SDr. David Alan Gilbert      */
2547329c9b10SDr. David Alan Gilbert     if (rdma->pin_all) {
2548733252deSDr. David Alan Gilbert         trace_qemu_rdma_connect_pin_all_requested();
2549329c9b10SDr. David Alan Gilbert         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2550329c9b10SDr. David Alan Gilbert     }
2551329c9b10SDr. David Alan Gilbert 
2552329c9b10SDr. David Alan Gilbert     caps_to_network(&cap);
2553329c9b10SDr. David Alan Gilbert 
25543c0c3ebaSMarkus Armbruster     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
25554a102179SMarkus Armbruster     if (ret < 0) {
25569cf2bab2SDr. David Alan Gilbert         goto err_rdma_source_connect;
25579cf2bab2SDr. David Alan Gilbert     }
25589cf2bab2SDr. David Alan Gilbert 
2559329c9b10SDr. David Alan Gilbert     ret = rdma_connect(rdma->cm_id, &conn_param);
25604a102179SMarkus Armbruster     if (ret < 0) {
256135b1561eSMarkus Armbruster         error_setg_errno(errp, errno,
256235b1561eSMarkus Armbruster                          "RDMA ERROR: connecting to destination!");
2563329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2564329c9b10SDr. David Alan Gilbert     }
2565329c9b10SDr. David Alan Gilbert 
2566e49e49ddSLi Zhijian     if (return_path) {
2567e49e49ddSLi Zhijian         ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2568e49e49ddSLi Zhijian     } else {
2569329c9b10SDr. David Alan Gilbert         ret = rdma_get_cm_event(rdma->channel, &cm_event);
2570f35c0d9bSMarkus Armbruster         if (ret < 0) {
257135b1561eSMarkus Armbruster             error_setg_errno(errp, errno,
257235b1561eSMarkus Armbruster                              "RDMA ERROR: failed to get cm event");
25738fd471bdSMarkus Armbruster         }
2574f35c0d9bSMarkus Armbruster     }
25754a102179SMarkus Armbruster     if (ret < 0) {
2576329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2577329c9b10SDr. David Alan Gilbert     }
2578329c9b10SDr. David Alan Gilbert 
2579329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
25808fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: connecting to destination!");
2581329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
2582329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2583329c9b10SDr. David Alan Gilbert     }
2584329c9b10SDr. David Alan Gilbert     rdma->connected = true;
2585329c9b10SDr. David Alan Gilbert 
2586329c9b10SDr. David Alan Gilbert     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2587329c9b10SDr. David Alan Gilbert     network_to_caps(&cap);
2588329c9b10SDr. David Alan Gilbert 
2589329c9b10SDr. David Alan Gilbert     /*
2590329c9b10SDr. David Alan Gilbert      * Verify that the *requested* capabilities are supported by the destination
2591329c9b10SDr. David Alan Gilbert      * and disable them otherwise.
2592329c9b10SDr. David Alan Gilbert      */
2593329c9b10SDr. David Alan Gilbert     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2594e518b005SMarkus Armbruster         warn_report("RDMA: Server cannot support pinning all memory. "
2595329c9b10SDr. David Alan Gilbert                     "Will register memory dynamically.");
2596329c9b10SDr. David Alan Gilbert         rdma->pin_all = false;
2597329c9b10SDr. David Alan Gilbert     }
2598329c9b10SDr. David Alan Gilbert 
2599733252deSDr. David Alan Gilbert     trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2600329c9b10SDr. David Alan Gilbert 
2601329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
2602329c9b10SDr. David Alan Gilbert 
2603329c9b10SDr. David Alan Gilbert     rdma->control_ready_expected = 1;
2604329c9b10SDr. David Alan Gilbert     rdma->nb_sent = 0;
2605329c9b10SDr. David Alan Gilbert     return 0;
2606329c9b10SDr. David Alan Gilbert 
2607329c9b10SDr. David Alan Gilbert err_rdma_source_connect:
2608329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
2609329c9b10SDr. David Alan Gilbert     return -1;
2610329c9b10SDr. David Alan Gilbert }
2611329c9b10SDr. David Alan Gilbert 
qemu_rdma_dest_init(RDMAContext * rdma,Error ** errp)2612329c9b10SDr. David Alan Gilbert static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2613329c9b10SDr. David Alan Gilbert {
2614071d5ae4SMarkus Armbruster     Error *err = NULL;
2615ebdb85f9SJuan Quintela     int ret;
2616329c9b10SDr. David Alan Gilbert     struct rdma_cm_id *listen_id;
2617329c9b10SDr. David Alan Gilbert     char ip[40] = "unknown";
26181dbd2fd9SMichael Tokarev     struct rdma_addrinfo *res, *e;
2619329c9b10SDr. David Alan Gilbert     char port_str[16];
2620f736e414SJack Wang     int reuse = 1;
2621329c9b10SDr. David Alan Gilbert 
2622ebdb85f9SJuan Quintela     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2623ebdb85f9SJuan Quintela         rdma->wr_data[i].control_len = 0;
2624ebdb85f9SJuan Quintela         rdma->wr_data[i].control_curr = NULL;
2625329c9b10SDr. David Alan Gilbert     }
2626329c9b10SDr. David Alan Gilbert 
26271dbd2fd9SMichael Tokarev     if (!rdma->host || !rdma->host[0]) {
26288fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: RDMA host is not set!");
2629b86c94a4SMarkus Armbruster         rdma->errored = true;
2630329c9b10SDr. David Alan Gilbert         return -1;
2631329c9b10SDr. David Alan Gilbert     }
2632329c9b10SDr. David Alan Gilbert     /* create CM channel */
2633329c9b10SDr. David Alan Gilbert     rdma->channel = rdma_create_event_channel();
2634329c9b10SDr. David Alan Gilbert     if (!rdma->channel) {
26358fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not create rdma event channel");
2636b86c94a4SMarkus Armbruster         rdma->errored = true;
2637329c9b10SDr. David Alan Gilbert         return -1;
2638329c9b10SDr. David Alan Gilbert     }
2639329c9b10SDr. David Alan Gilbert 
2640329c9b10SDr. David Alan Gilbert     /* create CM id */
2641329c9b10SDr. David Alan Gilbert     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
26424a102179SMarkus Armbruster     if (ret < 0) {
26438fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not create cm_id!");
2644329c9b10SDr. David Alan Gilbert         goto err_dest_init_create_listen_id;
2645329c9b10SDr. David Alan Gilbert     }
2646329c9b10SDr. David Alan Gilbert 
2647329c9b10SDr. David Alan Gilbert     snprintf(port_str, 16, "%d", rdma->port);
2648329c9b10SDr. David Alan Gilbert     port_str[15] = '\0';
2649329c9b10SDr. David Alan Gilbert 
2650329c9b10SDr. David Alan Gilbert     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
265107249822SMarkus Armbruster     if (ret) {
26528fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
26538fd471bdSMarkus Armbruster                    rdma->host);
2654329c9b10SDr. David Alan Gilbert         goto err_dest_init_bind_addr;
2655329c9b10SDr. David Alan Gilbert     }
2656329c9b10SDr. David Alan Gilbert 
2657f736e414SJack Wang     ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
2658f736e414SJack Wang                           &reuse, sizeof reuse);
26594a102179SMarkus Armbruster     if (ret < 0) {
26608fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
2661f736e414SJack Wang         goto err_dest_init_bind_addr;
2662f736e414SJack Wang     }
2663071d5ae4SMarkus Armbruster 
2664071d5ae4SMarkus Armbruster     /* Try all addresses, saving the first error in @err */
2665329c9b10SDr. David Alan Gilbert     for (e = res; e != NULL; e = e->ai_next) {
2666071d5ae4SMarkus Armbruster         Error **local_errp = err ? NULL : &err;
2667071d5ae4SMarkus Armbruster 
2668329c9b10SDr. David Alan Gilbert         inet_ntop(e->ai_family,
2669329c9b10SDr. David Alan Gilbert             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2670733252deSDr. David Alan Gilbert         trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2671329c9b10SDr. David Alan Gilbert         ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
26724a102179SMarkus Armbruster         if (ret < 0) {
26731dbd2fd9SMichael Tokarev             continue;
26741dbd2fd9SMichael Tokarev         }
2675329c9b10SDr. David Alan Gilbert         if (e->ai_family == AF_INET6) {
2676071d5ae4SMarkus Armbruster             ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
2677071d5ae4SMarkus Armbruster                                                local_errp);
26784a102179SMarkus Armbruster             if (ret < 0) {
2679329c9b10SDr. David Alan Gilbert                 continue;
2680329c9b10SDr. David Alan Gilbert             }
2681329c9b10SDr. David Alan Gilbert         }
2682071d5ae4SMarkus Armbruster         error_free(err);
26831dbd2fd9SMichael Tokarev         break;
2684329c9b10SDr. David Alan Gilbert     }
2685329c9b10SDr. David Alan Gilbert 
2686f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
26871dbd2fd9SMichael Tokarev     if (!e) {
2688071d5ae4SMarkus Armbruster         if (err) {
2689071d5ae4SMarkus Armbruster             error_propagate(errp, err);
2690071d5ae4SMarkus Armbruster         } else {
26918fd471bdSMarkus Armbruster             error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
26928fd471bdSMarkus Armbruster         }
2693329c9b10SDr. David Alan Gilbert         goto err_dest_init_bind_addr;
2694329c9b10SDr. David Alan Gilbert     }
2695329c9b10SDr. David Alan Gilbert 
2696329c9b10SDr. David Alan Gilbert     rdma->listen_id = listen_id;
2697329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("dest_init", listen_id);
2698329c9b10SDr. David Alan Gilbert     return 0;
2699329c9b10SDr. David Alan Gilbert 
2700329c9b10SDr. David Alan Gilbert err_dest_init_bind_addr:
2701329c9b10SDr. David Alan Gilbert     rdma_destroy_id(listen_id);
2702329c9b10SDr. David Alan Gilbert err_dest_init_create_listen_id:
2703329c9b10SDr. David Alan Gilbert     rdma_destroy_event_channel(rdma->channel);
2704329c9b10SDr. David Alan Gilbert     rdma->channel = NULL;
2705b86c94a4SMarkus Armbruster     rdma->errored = true;
270607249822SMarkus Armbruster     return -1;
2707329c9b10SDr. David Alan Gilbert 
2708329c9b10SDr. David Alan Gilbert }
2709329c9b10SDr. David Alan Gilbert 
/*
 * Set up @rdma_return_path as the return-path companion of @rdma on the
 * destination side.
 *
 * The two contexts are linked to each other, the return path borrows the
 * CM event channel and listening id of @rdma, and its per-slot control
 * receive state is cleared.
 */
static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
                                            RDMAContext *rdma)
{
    int slot = 0;

    /* Cross-link the two contexts and flag the return path as such */
    rdma_return_path->is_return_path = true;
    rdma_return_path->return_path = rdma;
    rdma->return_path = rdma_return_path;

    /* The CM channel and the listening CM id are shared, not duplicated */
    rdma_return_path->listen_id = rdma->listen_id;
    rdma_return_path->channel = rdma->channel;

    /* No control data is pending on a freshly initialised return path */
    while (slot < RDMA_WRID_MAX) {
        rdma_return_path->wr_data[slot].control_curr = NULL;
        rdma_return_path->wr_data[slot].control_len = 0;
        slot++;
    }
}
272655cc1b59SLidong Chen 
qemu_rdma_data_init(InetSocketAddress * saddr,Error ** errp)27273fa9642fSHet Gala static RDMAContext *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp)
2728329c9b10SDr. David Alan Gilbert {
2729329c9b10SDr. David Alan Gilbert     RDMAContext *rdma = NULL;
2730329c9b10SDr. David Alan Gilbert 
273197f3ad35SMarkus Armbruster     rdma = g_new0(RDMAContext, 1);
2732329c9b10SDr. David Alan Gilbert     rdma->current_index = -1;
2733329c9b10SDr. David Alan Gilbert     rdma->current_chunk = -1;
2734329c9b10SDr. David Alan Gilbert 
27353fa9642fSHet Gala     rdma->host = g_strdup(saddr->host);
27363fa9642fSHet Gala     rdma->port = atoi(saddr->port);
2737329c9b10SDr. David Alan Gilbert     return rdma;
2738329c9b10SDr. David Alan Gilbert }
2739329c9b10SDr. David Alan Gilbert 
2740329c9b10SDr. David Alan Gilbert /*
2741329c9b10SDr. David Alan Gilbert  * QEMUFile interface to the control channel.
2742329c9b10SDr. David Alan Gilbert  * SEND messages for control only.
2743329c9b10SDr. David Alan Gilbert  * VM's ram is handled with regular RDMA messages.
2744329c9b10SDr. David Alan Gilbert  */
qio_channel_rdma_writev(QIOChannel * ioc,const struct iovec * iov,size_t niov,int * fds,size_t nfds,int flags,Error ** errp)27456ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
27466ddd2d76SDaniel P. Berrange                                        const struct iovec *iov,
27476ddd2d76SDaniel P. Berrange                                        size_t niov,
27486ddd2d76SDaniel P. Berrange                                        int *fds,
27496ddd2d76SDaniel P. Berrange                                        size_t nfds,
2750b88651cbSLeonardo Bras                                        int flags,
27516ddd2d76SDaniel P. Berrange                                        Error **errp)
2752329c9b10SDr. David Alan Gilbert {
27536ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
275474637e6fSLidong Chen     RDMAContext *rdma;
2755329c9b10SDr. David Alan Gilbert     int ret;
27566ddd2d76SDaniel P. Berrange     ssize_t done = 0;
275714e2fcbbSJuan Quintela     size_t len;
2758329c9b10SDr. David Alan Gilbert 
2759987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
2760d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
276174637e6fSLidong Chen 
276274637e6fSLidong Chen     if (!rdma) {
276374ecf6acSFiona Ebner         error_setg(errp, "RDMA control channel output is not set");
276474ecf6acSFiona Ebner         return -1;
276574637e6fSLidong Chen     }
276674637e6fSLidong Chen 
2767b86c94a4SMarkus Armbruster     if (rdma->errored) {
27688e262e0bSMarkus Armbruster         error_setg(errp,
27698e262e0bSMarkus Armbruster                    "RDMA is in an error state waiting migration to abort!");
27708e262e0bSMarkus Armbruster         return -1;
27718e262e0bSMarkus Armbruster     }
2772329c9b10SDr. David Alan Gilbert 
2773329c9b10SDr. David Alan Gilbert     /*
2774329c9b10SDr. David Alan Gilbert      * Push out any writes that
2775329c9b10SDr. David Alan Gilbert      * we're queued up for VM's ram.
2776329c9b10SDr. David Alan Gilbert      */
277756095477SMarkus Armbruster     ret = qemu_rdma_write_flush(rdma, errp);
2778329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2779b86c94a4SMarkus Armbruster         rdma->errored = true;
278074ecf6acSFiona Ebner         return -1;
2781329c9b10SDr. David Alan Gilbert     }
2782329c9b10SDr. David Alan Gilbert 
278314e2fcbbSJuan Quintela     for (int i = 0; i < niov; i++) {
27846ddd2d76SDaniel P. Berrange         size_t remaining = iov[i].iov_len;
27856ddd2d76SDaniel P. Berrange         uint8_t * data = (void *)iov[i].iov_base;
2786329c9b10SDr. David Alan Gilbert         while (remaining) {
27872ada4b63SLi Zhijian             RDMAControlHeader head = {};
2788329c9b10SDr. David Alan Gilbert 
2789f38f6d41SLidong Chen             len = MIN(remaining, RDMA_SEND_INCREMENT);
2790f38f6d41SLidong Chen             remaining -= len;
2791329c9b10SDr. David Alan Gilbert 
2792f38f6d41SLidong Chen             head.len = len;
2793329c9b10SDr. David Alan Gilbert             head.type = RDMA_CONTROL_QEMU_FILE;
2794329c9b10SDr. David Alan Gilbert 
2795c4c78dceSMarkus Armbruster             ret = qemu_rdma_exchange_send(rdma, &head,
2796c4c78dceSMarkus Armbruster                                           data, NULL, NULL, NULL, errp);
2797329c9b10SDr. David Alan Gilbert 
2798329c9b10SDr. David Alan Gilbert             if (ret < 0) {
2799b86c94a4SMarkus Armbruster                 rdma->errored = true;
280074ecf6acSFiona Ebner                 return -1;
2801329c9b10SDr. David Alan Gilbert             }
2802329c9b10SDr. David Alan Gilbert 
2803f38f6d41SLidong Chen             data += len;
2804f38f6d41SLidong Chen             done += len;
28056ddd2d76SDaniel P. Berrange         }
2806329c9b10SDr. David Alan Gilbert     }
2807329c9b10SDr. David Alan Gilbert 
28086ddd2d76SDaniel P. Berrange     return done;
2809329c9b10SDr. David Alan Gilbert }
2810329c9b10SDr. David Alan Gilbert 
/*
 * Copy up to @size bytes of already-received control data from
 * work-request slot @idx into @buf and consume them from the slot
 * (its read pointer is advanced and its remaining length reduced).
 *
 * Returns the number of bytes copied; 0 when the slot holds no data.
 */
static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             size_t size, int idx)
{
    size_t copied;

    if (!rdma->wr_data[idx].control_len) {
        return 0;
    }

    trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);

    /* Hand out no more than the caller asked for or the slot holds */
    copied = MIN(size, rdma->wr_data[idx].control_len);
    memcpy(buf, rdma->wr_data[idx].control_curr, copied);

    rdma->wr_data[idx].control_len -= copied;
    rdma->wr_data[idx].control_curr += copied;

    return copied;
}
2827329c9b10SDr. David Alan Gilbert 
2828329c9b10SDr. David Alan Gilbert /*
2829329c9b10SDr. David Alan Gilbert  * QEMUFile interface to the control channel.
2830329c9b10SDr. David Alan Gilbert  * RDMA links don't use bytestreams, so we have to
2831329c9b10SDr. David Alan Gilbert  * return bytes to QEMUFile opportunistically.
2832329c9b10SDr. David Alan Gilbert  */
qio_channel_rdma_readv(QIOChannel * ioc,const struct iovec * iov,size_t niov,int ** fds,size_t * nfds,int flags,Error ** errp)28336ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
28346ddd2d76SDaniel P. Berrange                                       const struct iovec *iov,
28356ddd2d76SDaniel P. Berrange                                       size_t niov,
28366ddd2d76SDaniel P. Berrange                                       int **fds,
28376ddd2d76SDaniel P. Berrange                                       size_t *nfds,
283884615a19Smanish.mishra                                       int flags,
28396ddd2d76SDaniel P. Berrange                                       Error **errp)
2840329c9b10SDr. David Alan Gilbert {
28416ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
284274637e6fSLidong Chen     RDMAContext *rdma;
2843329c9b10SDr. David Alan Gilbert     RDMAControlHeader head;
2844c0d77702SMarkus Armbruster     int ret;
28458ff58b05SMarkus Armbruster     ssize_t done = 0;
284614e2fcbbSJuan Quintela     size_t len;
2847329c9b10SDr. David Alan Gilbert 
2848987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
2849d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmain);
285074637e6fSLidong Chen 
285174637e6fSLidong Chen     if (!rdma) {
285274ecf6acSFiona Ebner         error_setg(errp, "RDMA control channel input is not set");
285374ecf6acSFiona Ebner         return -1;
285474637e6fSLidong Chen     }
285574637e6fSLidong Chen 
2856b86c94a4SMarkus Armbruster     if (rdma->errored) {
28578e262e0bSMarkus Armbruster         error_setg(errp,
28588e262e0bSMarkus Armbruster                    "RDMA is in an error state waiting migration to abort!");
28598e262e0bSMarkus Armbruster         return -1;
28608e262e0bSMarkus Armbruster     }
2861329c9b10SDr. David Alan Gilbert 
286214e2fcbbSJuan Quintela     for (int i = 0; i < niov; i++) {
28636ddd2d76SDaniel P. Berrange         size_t want = iov[i].iov_len;
28646ddd2d76SDaniel P. Berrange         uint8_t *data = (void *)iov[i].iov_base;
28656ddd2d76SDaniel P. Berrange 
2866329c9b10SDr. David Alan Gilbert         /*
2867329c9b10SDr. David Alan Gilbert          * First, we hold on to the last SEND message we
2868329c9b10SDr. David Alan Gilbert          * were given and dish out the bytes until we run
2869329c9b10SDr. David Alan Gilbert          * out of bytes.
2870329c9b10SDr. David Alan Gilbert          */
287125352b37SMarkus Armbruster         len = qemu_rdma_fill(rdma, data, want, 0);
287225352b37SMarkus Armbruster         done += len;
287325352b37SMarkus Armbruster         want -= len;
28746ddd2d76SDaniel P. Berrange         /* Got what we needed, so go to next iovec */
28756ddd2d76SDaniel P. Berrange         if (want == 0) {
28766ddd2d76SDaniel P. Berrange             continue;
2877329c9b10SDr. David Alan Gilbert         }
2878329c9b10SDr. David Alan Gilbert 
28796ddd2d76SDaniel P. Berrange         /* If we got any data so far, then don't wait
28806ddd2d76SDaniel P. Berrange          * for more, just return what we have */
28816ddd2d76SDaniel P. Berrange         if (done > 0) {
28826ddd2d76SDaniel P. Berrange             break;
28836ddd2d76SDaniel P. Berrange         }
28846ddd2d76SDaniel P. Berrange 
28856ddd2d76SDaniel P. Berrange 
28866ddd2d76SDaniel P. Berrange         /* We've got nothing at all, so lets wait for
28876ddd2d76SDaniel P. Berrange          * more to arrive
2888329c9b10SDr. David Alan Gilbert          */
288996f363d8SMarkus Armbruster         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
289096f363d8SMarkus Armbruster                                       errp);
2891329c9b10SDr. David Alan Gilbert 
2892329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2893b86c94a4SMarkus Armbruster             rdma->errored = true;
289474ecf6acSFiona Ebner             return -1;
2895329c9b10SDr. David Alan Gilbert         }
2896329c9b10SDr. David Alan Gilbert 
2897329c9b10SDr. David Alan Gilbert         /*
2898329c9b10SDr. David Alan Gilbert          * SEND was received with new bytes, now try again.
2899329c9b10SDr. David Alan Gilbert          */
290025352b37SMarkus Armbruster         len = qemu_rdma_fill(rdma, data, want, 0);
290125352b37SMarkus Armbruster         done += len;
290225352b37SMarkus Armbruster         want -= len;
29036ddd2d76SDaniel P. Berrange 
29046ddd2d76SDaniel P. Berrange         /* Still didn't get enough, so lets just return */
29056ddd2d76SDaniel P. Berrange         if (want) {
29066ddd2d76SDaniel P. Berrange             if (done == 0) {
29076ddd2d76SDaniel P. Berrange                 return QIO_CHANNEL_ERR_BLOCK;
29086ddd2d76SDaniel P. Berrange             } else {
29096ddd2d76SDaniel P. Berrange                 break;
29106ddd2d76SDaniel P. Berrange             }
29116ddd2d76SDaniel P. Berrange         }
29126ddd2d76SDaniel P. Berrange     }
2913f38f6d41SLidong Chen     return done;
2914329c9b10SDr. David Alan Gilbert }
2915329c9b10SDr. David Alan Gilbert 
2916329c9b10SDr. David Alan Gilbert /*
2917329c9b10SDr. David Alan Gilbert  * Block until all the outstanding chunks have been delivered by the hardware.
2918329c9b10SDr. David Alan Gilbert  */
qemu_rdma_drain_cq(RDMAContext * rdma)2919e3378035SJuan Quintela static int qemu_rdma_drain_cq(RDMAContext *rdma)
2920329c9b10SDr. David Alan Gilbert {
292156095477SMarkus Armbruster     Error *err = NULL;
2922329c9b10SDr. David Alan Gilbert 
292356095477SMarkus Armbruster     if (qemu_rdma_write_flush(rdma, &err) < 0) {
292456095477SMarkus Armbruster         error_report_err(err);
29258c6513f7SMarkus Armbruster         return -1;
2926329c9b10SDr. David Alan Gilbert     }
2927329c9b10SDr. David Alan Gilbert 
2928329c9b10SDr. David Alan Gilbert     while (rdma->nb_sent) {
29298f5a7faaSJuan Quintela         if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) {
2930733252deSDr. David Alan Gilbert             error_report("rdma migration: complete polling error!");
29318c6513f7SMarkus Armbruster             return -1;
2932329c9b10SDr. David Alan Gilbert         }
2933329c9b10SDr. David Alan Gilbert     }
2934329c9b10SDr. David Alan Gilbert 
2935329c9b10SDr. David Alan Gilbert     qemu_rdma_unregister_waiting(rdma);
2936329c9b10SDr. David Alan Gilbert 
2937329c9b10SDr. David Alan Gilbert     return 0;
2938329c9b10SDr. David Alan Gilbert }
2939329c9b10SDr. David Alan Gilbert 
29406ddd2d76SDaniel P. Berrange 
qio_channel_rdma_set_blocking(QIOChannel * ioc,bool blocking,Error ** errp)29416ddd2d76SDaniel P. Berrange static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
29426ddd2d76SDaniel P. Berrange                                          bool blocking,
29436ddd2d76SDaniel P. Berrange                                          Error **errp)
2944329c9b10SDr. David Alan Gilbert {
29456ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
29466ddd2d76SDaniel P. Berrange     /* XXX we should make readv/writev actually honour this :-) */
29476ddd2d76SDaniel P. Berrange     rioc->blocking = blocking;
29486ddd2d76SDaniel P. Berrange     return 0;
2949329c9b10SDr. David Alan Gilbert }
29506ddd2d76SDaniel P. Berrange 
29516ddd2d76SDaniel P. Berrange 
29526ddd2d76SDaniel P. Berrange typedef struct QIOChannelRDMASource QIOChannelRDMASource;
29536ddd2d76SDaniel P. Berrange struct QIOChannelRDMASource {
29546ddd2d76SDaniel P. Berrange     GSource parent;
29556ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc;
29566ddd2d76SDaniel P. Berrange     GIOCondition condition;
29576ddd2d76SDaniel P. Berrange };
29586ddd2d76SDaniel P. Berrange 
29596ddd2d76SDaniel P. Berrange static gboolean
qio_channel_rdma_source_prepare(GSource * source,gint * timeout)29606ddd2d76SDaniel P. Berrange qio_channel_rdma_source_prepare(GSource *source,
29616ddd2d76SDaniel P. Berrange                                 gint *timeout)
29626ddd2d76SDaniel P. Berrange {
29636ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
296474637e6fSLidong Chen     RDMAContext *rdma;
29656ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
29666ddd2d76SDaniel P. Berrange     *timeout = -1;
29676ddd2d76SDaniel P. Berrange 
2968987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
296974637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
2970d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
297174637e6fSLidong Chen     } else {
2972d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
297374637e6fSLidong Chen     }
297474637e6fSLidong Chen 
297574637e6fSLidong Chen     if (!rdma) {
297674637e6fSLidong Chen         error_report("RDMAContext is NULL when prepare Gsource");
297774637e6fSLidong Chen         return FALSE;
297874637e6fSLidong Chen     }
297974637e6fSLidong Chen 
29806ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
29816ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
29826ddd2d76SDaniel P. Berrange     }
29836ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
29846ddd2d76SDaniel P. Berrange 
29856ddd2d76SDaniel P. Berrange     return cond & rsource->condition;
29866ddd2d76SDaniel P. Berrange }
29876ddd2d76SDaniel P. Berrange 
29886ddd2d76SDaniel P. Berrange static gboolean
qio_channel_rdma_source_check(GSource * source)29896ddd2d76SDaniel P. Berrange qio_channel_rdma_source_check(GSource *source)
29906ddd2d76SDaniel P. Berrange {
29916ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
299274637e6fSLidong Chen     RDMAContext *rdma;
29936ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
29946ddd2d76SDaniel P. Berrange 
2995987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
299674637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
2997d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
299874637e6fSLidong Chen     } else {
2999d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
300074637e6fSLidong Chen     }
300174637e6fSLidong Chen 
300274637e6fSLidong Chen     if (!rdma) {
300374637e6fSLidong Chen         error_report("RDMAContext is NULL when check Gsource");
300474637e6fSLidong Chen         return FALSE;
300574637e6fSLidong Chen     }
300674637e6fSLidong Chen 
30076ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
30086ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
30096ddd2d76SDaniel P. Berrange     }
30106ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
30116ddd2d76SDaniel P. Berrange 
30126ddd2d76SDaniel P. Berrange     return cond & rsource->condition;
30136ddd2d76SDaniel P. Berrange }
30146ddd2d76SDaniel P. Berrange 
30156ddd2d76SDaniel P. Berrange static gboolean
qio_channel_rdma_source_dispatch(GSource * source,GSourceFunc callback,gpointer user_data)30166ddd2d76SDaniel P. Berrange qio_channel_rdma_source_dispatch(GSource *source,
30176ddd2d76SDaniel P. Berrange                                  GSourceFunc callback,
30186ddd2d76SDaniel P. Berrange                                  gpointer user_data)
30196ddd2d76SDaniel P. Berrange {
30206ddd2d76SDaniel P. Berrange     QIOChannelFunc func = (QIOChannelFunc)callback;
30216ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
302274637e6fSLidong Chen     RDMAContext *rdma;
30236ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
30246ddd2d76SDaniel P. Berrange 
3025987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
302674637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
3027d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
302874637e6fSLidong Chen     } else {
3029d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
303074637e6fSLidong Chen     }
303174637e6fSLidong Chen 
303274637e6fSLidong Chen     if (!rdma) {
303374637e6fSLidong Chen         error_report("RDMAContext is NULL when dispatch Gsource");
303474637e6fSLidong Chen         return FALSE;
303574637e6fSLidong Chen     }
303674637e6fSLidong Chen 
30376ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
30386ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
30396ddd2d76SDaniel P. Berrange     }
30406ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
30416ddd2d76SDaniel P. Berrange 
30426ddd2d76SDaniel P. Berrange     return (*func)(QIO_CHANNEL(rsource->rioc),
30436ddd2d76SDaniel P. Berrange                    (cond & rsource->condition),
30446ddd2d76SDaniel P. Berrange                    user_data);
30456ddd2d76SDaniel P. Berrange }
30466ddd2d76SDaniel P. Berrange 
30476ddd2d76SDaniel P. Berrange static void
qio_channel_rdma_source_finalize(GSource * source)30486ddd2d76SDaniel P. Berrange qio_channel_rdma_source_finalize(GSource *source)
30496ddd2d76SDaniel P. Berrange {
30506ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
30516ddd2d76SDaniel P. Berrange 
30526ddd2d76SDaniel P. Berrange     object_unref(OBJECT(ssource->rioc));
30536ddd2d76SDaniel P. Berrange }
30546ddd2d76SDaniel P. Berrange 
305536cc822dSMarkus Armbruster static GSourceFuncs qio_channel_rdma_source_funcs = {
30566ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_prepare,
30576ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_check,
30586ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_dispatch,
30596ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_finalize
30606ddd2d76SDaniel P. Berrange };
30616ddd2d76SDaniel P. Berrange 
qio_channel_rdma_create_watch(QIOChannel * ioc,GIOCondition condition)30626ddd2d76SDaniel P. Berrange static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
30636ddd2d76SDaniel P. Berrange                                               GIOCondition condition)
30646ddd2d76SDaniel P. Berrange {
30656ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
30666ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *ssource;
30676ddd2d76SDaniel P. Berrange     GSource *source;
30686ddd2d76SDaniel P. Berrange 
30696ddd2d76SDaniel P. Berrange     source = g_source_new(&qio_channel_rdma_source_funcs,
30706ddd2d76SDaniel P. Berrange                           sizeof(QIOChannelRDMASource));
30716ddd2d76SDaniel P. Berrange     ssource = (QIOChannelRDMASource *)source;
30726ddd2d76SDaniel P. Berrange 
30736ddd2d76SDaniel P. Berrange     ssource->rioc = rioc;
30746ddd2d76SDaniel P. Berrange     object_ref(OBJECT(rioc));
30756ddd2d76SDaniel P. Berrange 
30766ddd2d76SDaniel P. Berrange     ssource->condition = condition;
30776ddd2d76SDaniel P. Berrange 
30786ddd2d76SDaniel P. Berrange     return source;
30796ddd2d76SDaniel P. Berrange }
30806ddd2d76SDaniel P. Berrange 
/*
 * Install @io_read/@io_write (with @opaque) on both completion-channel
 * file descriptors of one RDMA context.
 *
 * When @io_read is non-NULL the input context is used together with
 * @read_ctx; otherwise the output context is used together with @write_ctx.
 *
 * NOTE(review): rdmain/rdmaout are dereferenced here without a NULL check
 * or an RCU read section - confirm callers guarantee the channel is open.
 */
static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
                                                AioContext *read_ctx,
                                                IOHandler *io_read,
                                                AioContext *write_ctx,
                                                IOHandler *io_write,
                                                void *opaque)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    AioContext *ctx;
    RDMAContext *rdma;

    if (io_read) {
        ctx = read_ctx;
        rdma = rioc->rdmain;
    } else {
        ctx = write_ctx;
        rdma = rioc->rdmaout;
    }

    aio_set_fd_handler(ctx, rdma->recv_comp_channel->fd,
                       io_read, io_write, NULL, NULL, opaque);
    aio_set_fd_handler(ctx, rdma->send_comp_channel->fd,
                       io_read, io_write, NULL, NULL, opaque);
}
31016ddd2d76SDaniel P. Berrange 
3102d46a4847SDr. David Alan Gilbert struct rdma_close_rcu {
3103d46a4847SDr. David Alan Gilbert     struct rcu_head rcu;
3104d46a4847SDr. David Alan Gilbert     RDMAContext *rdmain;
3105d46a4847SDr. David Alan Gilbert     RDMAContext *rdmaout;
3106d46a4847SDr. David Alan Gilbert };
3107d46a4847SDr. David Alan Gilbert 
3108d46a4847SDr. David Alan Gilbert /* callback from qio_channel_rdma_close via call_rcu */
qio_channel_rdma_close_rcu(struct rdma_close_rcu * rcu)3109d46a4847SDr. David Alan Gilbert static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3110d46a4847SDr. David Alan Gilbert {
3111d46a4847SDr. David Alan Gilbert     if (rcu->rdmain) {
3112d46a4847SDr. David Alan Gilbert         qemu_rdma_cleanup(rcu->rdmain);
3113d46a4847SDr. David Alan Gilbert     }
3114d46a4847SDr. David Alan Gilbert 
3115d46a4847SDr. David Alan Gilbert     if (rcu->rdmaout) {
3116d46a4847SDr. David Alan Gilbert         qemu_rdma_cleanup(rcu->rdmaout);
3117d46a4847SDr. David Alan Gilbert     }
3118d46a4847SDr. David Alan Gilbert 
3119d46a4847SDr. David Alan Gilbert     g_free(rcu->rdmain);
3120d46a4847SDr. David Alan Gilbert     g_free(rcu->rdmaout);
3121d46a4847SDr. David Alan Gilbert     g_free(rcu);
3122d46a4847SDr. David Alan Gilbert }
3123d46a4847SDr. David Alan Gilbert 
qio_channel_rdma_close(QIOChannel * ioc,Error ** errp)31246ddd2d76SDaniel P. Berrange static int qio_channel_rdma_close(QIOChannel *ioc,
31256ddd2d76SDaniel P. Berrange                                   Error **errp)
31266ddd2d76SDaniel P. Berrange {
31276ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
312874637e6fSLidong Chen     RDMAContext *rdmain, *rdmaout;
3129d46a4847SDr. David Alan Gilbert     struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3130d46a4847SDr. David Alan Gilbert 
31316ddd2d76SDaniel P. Berrange     trace_qemu_rdma_close();
313274637e6fSLidong Chen 
313374637e6fSLidong Chen     rdmain = rioc->rdmain;
313474637e6fSLidong Chen     if (rdmain) {
3135d73415a3SStefan Hajnoczi         qatomic_rcu_set(&rioc->rdmain, NULL);
313612c67ffbSDr. David Alan Gilbert     }
313774637e6fSLidong Chen 
313874637e6fSLidong Chen     rdmaout = rioc->rdmaout;
313974637e6fSLidong Chen     if (rdmaout) {
3140d73415a3SStefan Hajnoczi         qatomic_rcu_set(&rioc->rdmaout, NULL);
31416ddd2d76SDaniel P. Berrange     }
314274637e6fSLidong Chen 
3143d46a4847SDr. David Alan Gilbert     rcu->rdmain = rdmain;
3144d46a4847SDr. David Alan Gilbert     rcu->rdmaout = rdmaout;
3145d46a4847SDr. David Alan Gilbert     call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
314674637e6fSLidong Chen 
3147329c9b10SDr. David Alan Gilbert     return 0;
3148329c9b10SDr. David Alan Gilbert }
3149329c9b10SDr. David Alan Gilbert 
315054db882fSLidong Chen static int
qio_channel_rdma_shutdown(QIOChannel * ioc,QIOChannelShutdown how,Error ** errp)315154db882fSLidong Chen qio_channel_rdma_shutdown(QIOChannel *ioc,
315254db882fSLidong Chen                             QIOChannelShutdown how,
315354db882fSLidong Chen                             Error **errp)
315454db882fSLidong Chen {
315554db882fSLidong Chen     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
315654db882fSLidong Chen     RDMAContext *rdmain, *rdmaout;
315754db882fSLidong Chen 
3158987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
315954db882fSLidong Chen 
3160d73415a3SStefan Hajnoczi     rdmain = qatomic_rcu_read(&rioc->rdmain);
3161d73415a3SStefan Hajnoczi     rdmaout = qatomic_rcu_read(&rioc->rdmain);
316254db882fSLidong Chen 
316354db882fSLidong Chen     switch (how) {
316454db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_READ:
316554db882fSLidong Chen         if (rdmain) {
3166b86c94a4SMarkus Armbruster             rdmain->errored = true;
316754db882fSLidong Chen         }
316854db882fSLidong Chen         break;
316954db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_WRITE:
317054db882fSLidong Chen         if (rdmaout) {
3171b86c94a4SMarkus Armbruster             rdmaout->errored = true;
317254db882fSLidong Chen         }
317354db882fSLidong Chen         break;
317454db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_BOTH:
317554db882fSLidong Chen     default:
317654db882fSLidong Chen         if (rdmain) {
3177b86c94a4SMarkus Armbruster             rdmain->errored = true;
317854db882fSLidong Chen         }
317954db882fSLidong Chen         if (rdmaout) {
3180b86c94a4SMarkus Armbruster             rdmaout->errored = true;
318154db882fSLidong Chen         }
318254db882fSLidong Chen         break;
318354db882fSLidong Chen     }
318454db882fSLidong Chen 
318554db882fSLidong Chen     return 0;
318654db882fSLidong Chen }
318754db882fSLidong Chen 
3188329c9b10SDr. David Alan Gilbert /*
3189329c9b10SDr. David Alan Gilbert  * Parameters:
3190329c9b10SDr. David Alan Gilbert  *    @offset == 0 :
3191329c9b10SDr. David Alan Gilbert  *        This means that 'block_offset' is a full virtual address that does not
3192329c9b10SDr. David Alan Gilbert  *        belong to a RAMBlock of the virtual machine and instead
3193329c9b10SDr. David Alan Gilbert  *        represents a private malloc'd memory area that the caller wishes to
3194329c9b10SDr. David Alan Gilbert  *        transfer.
3195329c9b10SDr. David Alan Gilbert  *
3196329c9b10SDr. David Alan Gilbert  *    @offset != 0 :
3197329c9b10SDr. David Alan Gilbert  *        Offset is an offset to be added to block_offset and used
3198329c9b10SDr. David Alan Gilbert  *        to also lookup the corresponding RAMBlock.
3199329c9b10SDr. David Alan Gilbert  *
3200246683c2SDaniel P. Berrangé  *    @size : Number of bytes to transfer
3201329c9b10SDr. David Alan Gilbert  *
32029c53d369SJuan Quintela  *    @pages_sent : User-specificed pointer to indicate how many pages were
3203329c9b10SDr. David Alan Gilbert  *                  sent. Usually, this will not be more than a few bytes of
3204329c9b10SDr. David Alan Gilbert  *                  the protocol because most transfers are sent asynchronously.
3205329c9b10SDr. David Alan Gilbert  */
qemu_rdma_save_page(QEMUFile * f,ram_addr_t block_offset,ram_addr_t offset,size_t size)32069c53d369SJuan Quintela static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
32079c53d369SJuan Quintela                                ram_addr_t offset, size_t size)
3208329c9b10SDr. David Alan Gilbert {
3209365c0463SDaniel P. Berrangé     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3210446e559cSMarkus Armbruster     Error *err = NULL;
321174637e6fSLidong Chen     RDMAContext *rdma;
3212329c9b10SDr. David Alan Gilbert     int ret;
3213329c9b10SDr. David Alan Gilbert 
3214987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3215d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
321674637e6fSLidong Chen 
321774637e6fSLidong Chen     if (!rdma) {
32180110c6b8SMarkus Armbruster         return -1;
321974637e6fSLidong Chen     }
322074637e6fSLidong Chen 
3221b86c94a4SMarkus Armbruster     if (rdma_errored(rdma)) {
32220110c6b8SMarkus Armbruster         return -1;
3223de3e05e8SMarkus Armbruster     }
3224329c9b10SDr. David Alan Gilbert 
3225329c9b10SDr. David Alan Gilbert     qemu_fflush(f);
3226329c9b10SDr. David Alan Gilbert 
3227329c9b10SDr. David Alan Gilbert     /*
3228329c9b10SDr. David Alan Gilbert      * Add this page to the current 'chunk'. If the chunk
32293a4452d8Szhaolichang      * is full, or the page doesn't belong to the current chunk,
3230329c9b10SDr. David Alan Gilbert      * an actual RDMA write will occur and a new chunk will be formed.
3231329c9b10SDr. David Alan Gilbert      */
3232446e559cSMarkus Armbruster     ret = qemu_rdma_write(rdma, block_offset, offset, size, &err);
3233329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3234446e559cSMarkus Armbruster         error_report_err(err);
3235329c9b10SDr. David Alan Gilbert         goto err;
3236329c9b10SDr. David Alan Gilbert     }
3237329c9b10SDr. David Alan Gilbert 
3238329c9b10SDr. David Alan Gilbert     /*
3239329c9b10SDr. David Alan Gilbert      * Drain the Completion Queue if possible, but do not block,
3240329c9b10SDr. David Alan Gilbert      * just poll.
3241329c9b10SDr. David Alan Gilbert      *
3242329c9b10SDr. David Alan Gilbert      * If nothing to poll, the end of the iteration will do this
3243329c9b10SDr. David Alan Gilbert      * again to make sure we don't overflow the request queue.
3244329c9b10SDr. David Alan Gilbert      */
3245329c9b10SDr. David Alan Gilbert     while (1) {
3246329c9b10SDr. David Alan Gilbert         uint64_t wr_id, wr_id_in;
3247bbde6562SMarkus Armbruster         ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3248bbde6562SMarkus Armbruster 
3249b390afd8SLi Zhijian         if (ret < 0) {
32501b6e1da6SMarkus Armbruster             error_report("rdma migration: polling error");
3251b390afd8SLi Zhijian             goto err;
3252b390afd8SLi Zhijian         }
3253b390afd8SLi Zhijian 
3254b390afd8SLi Zhijian         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3255b390afd8SLi Zhijian 
3256b390afd8SLi Zhijian         if (wr_id == RDMA_WRID_NONE) {
3257b390afd8SLi Zhijian             break;
3258b390afd8SLi Zhijian         }
3259b390afd8SLi Zhijian     }
3260b390afd8SLi Zhijian 
3261b390afd8SLi Zhijian     while (1) {
3262b390afd8SLi Zhijian         uint64_t wr_id, wr_id_in;
3263bbde6562SMarkus Armbruster         ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3264bbde6562SMarkus Armbruster 
3265329c9b10SDr. David Alan Gilbert         if (ret < 0) {
32661b6e1da6SMarkus Armbruster             error_report("rdma migration: polling error");
3267329c9b10SDr. David Alan Gilbert             goto err;
3268329c9b10SDr. David Alan Gilbert         }
3269329c9b10SDr. David Alan Gilbert 
3270329c9b10SDr. David Alan Gilbert         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3271329c9b10SDr. David Alan Gilbert 
3272329c9b10SDr. David Alan Gilbert         if (wr_id == RDMA_WRID_NONE) {
3273329c9b10SDr. David Alan Gilbert             break;
3274329c9b10SDr. David Alan Gilbert         }
3275329c9b10SDr. David Alan Gilbert     }
3276329c9b10SDr. David Alan Gilbert 
3277329c9b10SDr. David Alan Gilbert     return RAM_SAVE_CONTROL_DELAYED;
32780110c6b8SMarkus Armbruster 
3279329c9b10SDr. David Alan Gilbert err:
3280b86c94a4SMarkus Armbruster     rdma->errored = true;
32810110c6b8SMarkus Armbruster     return -1;
3282329c9b10SDr. David Alan Gilbert }
3283329c9b10SDr. David Alan Gilbert 
/*
 * Entry point used to hand a page to the RDMA transport.
 *
 * Returns RAM_SAVE_CONTROL_NOT_SUPP when RDMA migration is not in use or
 * the migration is in postcopy. Otherwise returns the result of
 * qemu_rdma_save_page(); a negative result other than the two
 * RAM_SAVE_CONTROL_* codes is also recorded on @f with
 * qemu_file_set_error().
 */
int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
                           ram_addr_t offset, size_t size)
{
    bool rdma_usable = migrate_rdma() && !migration_in_postcopy();
    int ret;

    if (!rdma_usable) {
        return RAM_SAVE_CONTROL_NOT_SUPP;
    }

    ret = qemu_rdma_save_page(f, block_offset, offset, size);

    /* The control codes are themselves negative, so exclude them explicitly */
    if (ret < 0 &&
        ret != RAM_SAVE_CONTROL_DELAYED &&
        ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        qemu_file_set_error(f, ret);
    }

    return ret;
}
3301e493008dSJuan Quintela 
330255cc1b59SLidong Chen static void rdma_accept_incoming_migration(void *opaque);
330355cc1b59SLidong Chen 
rdma_cm_poll_handler(void * opaque)330492370989SLidong Chen static void rdma_cm_poll_handler(void *opaque)
330592370989SLidong Chen {
330692370989SLidong Chen     RDMAContext *rdma = opaque;
330792370989SLidong Chen     struct rdma_cm_event *cm_event;
330892370989SLidong Chen     MigrationIncomingState *mis = migration_incoming_get_current();
330992370989SLidong Chen 
33108f5a7faaSJuan Quintela     if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
331192370989SLidong Chen         error_report("get_cm_event failed %d", errno);
331292370989SLidong Chen         return;
331392370989SLidong Chen     }
331492370989SLidong Chen 
331592370989SLidong Chen     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
331692370989SLidong Chen         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3317b86c94a4SMarkus Armbruster         if (!rdma->errored &&
3318de8434a3SDr. David Alan Gilbert             migration_incoming_get_current()->state !=
3319de8434a3SDr. David Alan Gilbert               MIGRATION_STATUS_COMPLETED) {
332092370989SLidong Chen             error_report("receive cm event, cm event is %d", cm_event->event);
3321b86c94a4SMarkus Armbruster             rdma->errored = true;
332292370989SLidong Chen             if (rdma->return_path) {
3323b86c94a4SMarkus Armbruster                 rdma->return_path->errored = true;
332492370989SLidong Chen             }
3325de8434a3SDr. David Alan Gilbert         }
33266b8c2eb5SLi Zhijian         rdma_ack_cm_event(cm_event);
3327dd42ce24SVladimir Sementsov-Ogievskiy         if (mis->loadvm_co) {
3328dd42ce24SVladimir Sementsov-Ogievskiy             qemu_coroutine_enter(mis->loadvm_co);
332992370989SLidong Chen         }
333092370989SLidong Chen         return;
333192370989SLidong Chen     }
33326b8c2eb5SLi Zhijian     rdma_ack_cm_event(cm_event);
333392370989SLidong Chen }
333492370989SLidong Chen 
qemu_rdma_accept(RDMAContext * rdma)3335329c9b10SDr. David Alan Gilbert static int qemu_rdma_accept(RDMAContext *rdma)
3336329c9b10SDr. David Alan Gilbert {
33373c0c3ebaSMarkus Armbruster     Error *err = NULL;
3338329c9b10SDr. David Alan Gilbert     RDMACapabilities cap;
3339329c9b10SDr. David Alan Gilbert     struct rdma_conn_param conn_param = {
3340329c9b10SDr. David Alan Gilbert                                             .responder_resources = 2,
3341329c9b10SDr. David Alan Gilbert                                             .private_data = &cap,
3342329c9b10SDr. David Alan Gilbert                                             .private_data_len = sizeof(cap),
3343329c9b10SDr. David Alan Gilbert                                          };
334444bcfd45SLi Zhijian     RDMAContext *rdma_return_path = NULL;
33453fa9642fSHet Gala     g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1);
3346329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
3347329c9b10SDr. David Alan Gilbert     struct ibv_context *verbs;
3348c0d77702SMarkus Armbruster     int ret;
3349329c9b10SDr. David Alan Gilbert 
3350329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
33514a102179SMarkus Armbruster     if (ret < 0) {
3352329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3353329c9b10SDr. David Alan Gilbert     }
3354329c9b10SDr. David Alan Gilbert 
3355329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3356329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
3357329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3358329c9b10SDr. David Alan Gilbert     }
3359329c9b10SDr. David Alan Gilbert 
3360*69f7b00dSYu Zhang     isock->host = g_strdup(rdma->host);
33613fa9642fSHet Gala     isock->port = g_strdup_printf("%d", rdma->port);
33623fa9642fSHet Gala 
336344bcfd45SLi Zhijian     /*
336444bcfd45SLi Zhijian      * initialize the RDMAContext for return path for postcopy after first
336544bcfd45SLi Zhijian      * connection request reached.
336644bcfd45SLi Zhijian      */
336738ad1110SJuan Quintela     if ((migrate_postcopy() || migrate_return_path())
3368a5382214SDr. David Alan Gilbert         && !rdma->is_return_path) {
33693fa9642fSHet Gala         rdma_return_path = qemu_rdma_data_init(isock, NULL);
337044bcfd45SLi Zhijian         if (rdma_return_path == NULL) {
337144bcfd45SLi Zhijian             rdma_ack_cm_event(cm_event);
337244bcfd45SLi Zhijian             goto err_rdma_dest_wait;
337344bcfd45SLi Zhijian         }
337444bcfd45SLi Zhijian 
337544bcfd45SLi Zhijian         qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
337644bcfd45SLi Zhijian     }
337744bcfd45SLi Zhijian 
3378329c9b10SDr. David Alan Gilbert     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3379329c9b10SDr. David Alan Gilbert 
3380329c9b10SDr. David Alan Gilbert     network_to_caps(&cap);
3381329c9b10SDr. David Alan Gilbert 
3382329c9b10SDr. David Alan Gilbert     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3383733252deSDr. David Alan Gilbert         error_report("Unknown source RDMA version: %d, bailing...",
3384329c9b10SDr. David Alan Gilbert                      cap.version);
3385329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
3386329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3387329c9b10SDr. David Alan Gilbert     }
3388329c9b10SDr. David Alan Gilbert 
3389329c9b10SDr. David Alan Gilbert     /*
3390329c9b10SDr. David Alan Gilbert      * Respond with only the capabilities this version of QEMU knows about.
3391329c9b10SDr. David Alan Gilbert      */
3392329c9b10SDr. David Alan Gilbert     cap.flags &= known_capabilities;
3393329c9b10SDr. David Alan Gilbert 
3394329c9b10SDr. David Alan Gilbert     /*
3395329c9b10SDr. David Alan Gilbert      * Enable the ones that we do know about.
3396329c9b10SDr. David Alan Gilbert      * Add other checks here as new ones are introduced.
3397329c9b10SDr. David Alan Gilbert      */
3398329c9b10SDr. David Alan Gilbert     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3399329c9b10SDr. David Alan Gilbert         rdma->pin_all = true;
3400329c9b10SDr. David Alan Gilbert     }
3401329c9b10SDr. David Alan Gilbert 
3402329c9b10SDr. David Alan Gilbert     rdma->cm_id = cm_event->id;
3403329c9b10SDr. David Alan Gilbert     verbs = cm_event->id->verbs;
3404329c9b10SDr. David Alan Gilbert 
3405329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
3406329c9b10SDr. David Alan Gilbert 
3407733252deSDr. David Alan Gilbert     trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3408329c9b10SDr. David Alan Gilbert 
3409329c9b10SDr. David Alan Gilbert     caps_to_network(&cap);
3410329c9b10SDr. David Alan Gilbert 
3411733252deSDr. David Alan Gilbert     trace_qemu_rdma_accept_pin_verbsc(verbs);
3412329c9b10SDr. David Alan Gilbert 
3413329c9b10SDr. David Alan Gilbert     if (!rdma->verbs) {
3414329c9b10SDr. David Alan Gilbert         rdma->verbs = verbs;
3415329c9b10SDr. David Alan Gilbert     } else if (rdma->verbs != verbs) {
3416733252deSDr. David Alan Gilbert         error_report("ibv context not matching %p, %p!", rdma->verbs,
3417733252deSDr. David Alan Gilbert                      verbs);
3418329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3419329c9b10SDr. David Alan Gilbert     }
3420329c9b10SDr. David Alan Gilbert 
3421329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_id("dest_init", verbs);
3422329c9b10SDr. David Alan Gilbert 
342307d5b946SMarkus Armbruster     ret = qemu_rdma_alloc_pd_cq(rdma, &err);
34244a102179SMarkus Armbruster     if (ret < 0) {
342507d5b946SMarkus Armbruster         error_report_err(err);
3426329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3427329c9b10SDr. David Alan Gilbert     }
3428329c9b10SDr. David Alan Gilbert 
3429329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_qp(rdma);
34304a102179SMarkus Armbruster     if (ret < 0) {
3431733252deSDr. David Alan Gilbert         error_report("rdma migration: error allocating qp!");
3432329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3433329c9b10SDr. David Alan Gilbert     }
3434329c9b10SDr. David Alan Gilbert 
34350610d7a1SMarkus Armbruster     qemu_rdma_init_ram_blocks(rdma);
3436329c9b10SDr. David Alan Gilbert 
3437ebdb85f9SJuan Quintela     for (int i = 0; i < RDMA_WRID_MAX; i++) {
3438ebdb85f9SJuan Quintela         ret = qemu_rdma_reg_control(rdma, i);
34394a102179SMarkus Armbruster         if (ret < 0) {
3440ebdb85f9SJuan Quintela             error_report("rdma: error registering %d control", i);
3441329c9b10SDr. David Alan Gilbert             goto err_rdma_dest_wait;
3442329c9b10SDr. David Alan Gilbert         }
3443329c9b10SDr. David Alan Gilbert     }
3444329c9b10SDr. David Alan Gilbert 
344555cc1b59SLidong Chen     /* Accept the second connection request for return path */
344638ad1110SJuan Quintela     if ((migrate_postcopy() || migrate_return_path())
3447a5382214SDr. David Alan Gilbert         && !rdma->is_return_path) {
344855cc1b59SLidong Chen         qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
344955cc1b59SLidong Chen                             NULL,
345055cc1b59SLidong Chen                             (void *)(intptr_t)rdma->return_path);
345155cc1b59SLidong Chen     } else {
345292370989SLidong Chen         qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
345392370989SLidong Chen                             NULL, rdma);
345455cc1b59SLidong Chen     }
3455329c9b10SDr. David Alan Gilbert 
3456329c9b10SDr. David Alan Gilbert     ret = rdma_accept(rdma->cm_id, &conn_param);
34574a102179SMarkus Armbruster     if (ret < 0) {
34581b6e1da6SMarkus Armbruster         error_report("rdma_accept failed");
3459329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3460329c9b10SDr. David Alan Gilbert     }
3461329c9b10SDr. David Alan Gilbert 
3462329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
34634a102179SMarkus Armbruster     if (ret < 0) {
34641b6e1da6SMarkus Armbruster         error_report("rdma_accept get_cm_event failed");
3465329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3466329c9b10SDr. David Alan Gilbert     }
3467329c9b10SDr. David Alan Gilbert 
3468329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3469733252deSDr. David Alan Gilbert         error_report("rdma_accept not event established");
3470329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
3471329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3472329c9b10SDr. David Alan Gilbert     }
3473329c9b10SDr. David Alan Gilbert 
3474329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
3475329c9b10SDr. David Alan Gilbert     rdma->connected = true;
3476329c9b10SDr. David Alan Gilbert 
34773c0c3ebaSMarkus Armbruster     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err);
34784a102179SMarkus Armbruster     if (ret < 0) {
34793c0c3ebaSMarkus Armbruster         error_report_err(err);
3480329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3481329c9b10SDr. David Alan Gilbert     }
3482329c9b10SDr. David Alan Gilbert 
3483329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3484329c9b10SDr. David Alan Gilbert 
3485329c9b10SDr. David Alan Gilbert     return 0;
3486329c9b10SDr. David Alan Gilbert 
3487329c9b10SDr. David Alan Gilbert err_rdma_dest_wait:
3488b86c94a4SMarkus Armbruster     rdma->errored = true;
3489329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
349044bcfd45SLi Zhijian     g_free(rdma_return_path);
3491ec486974SMarkus Armbruster     return -1;
3492329c9b10SDr. David Alan Gilbert }
3493329c9b10SDr. David Alan Gilbert 
dest_ram_sort_func(const void * a,const void * b)3494e4d63320SDr. David Alan Gilbert static int dest_ram_sort_func(const void *a, const void *b)
3495e4d63320SDr. David Alan Gilbert {
3496e4d63320SDr. David Alan Gilbert     unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3497e4d63320SDr. David Alan Gilbert     unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3498e4d63320SDr. David Alan Gilbert 
3499e4d63320SDr. David Alan Gilbert     return (a_index < b_index) ? -1 : (a_index != b_index);
3500e4d63320SDr. David Alan Gilbert }
3501e4d63320SDr. David Alan Gilbert 
3502329c9b10SDr. David Alan Gilbert /*
3503329c9b10SDr. David Alan Gilbert  * During each iteration of the migration, we listen for instructions
3504329c9b10SDr. David Alan Gilbert  * by the source VM to perform dynamic page registrations before they
3505329c9b10SDr. David Alan Gilbert  * can perform RDMA operations.
3506329c9b10SDr. David Alan Gilbert  *
3507329c9b10SDr. David Alan Gilbert  * We respond with the 'rkey'.
3508329c9b10SDr. David Alan Gilbert  *
3509329c9b10SDr. David Alan Gilbert  * Keep doing this until the source tells us to stop.
3510329c9b10SDr. David Alan Gilbert  */
rdma_registration_handle(QEMUFile * f)3511b1b38387SJuan Quintela int rdma_registration_handle(QEMUFile *f)
3512329c9b10SDr. David Alan Gilbert {
3513329c9b10SDr. David Alan Gilbert     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3514329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_REGISTER_RESULT,
3515329c9b10SDr. David Alan Gilbert                                .repeat = 0,
3516329c9b10SDr. David Alan Gilbert                              };
3517329c9b10SDr. David Alan Gilbert     RDMAControlHeader unreg_resp = { .len = 0,
3518329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3519329c9b10SDr. David Alan Gilbert                                .repeat = 0,
3520329c9b10SDr. David Alan Gilbert                              };
3521329c9b10SDr. David Alan Gilbert     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3522329c9b10SDr. David Alan Gilbert                                  .repeat = 1 };
3523f6d6c089SJuan Quintela     QIOChannelRDMA *rioc;
352496f363d8SMarkus Armbruster     Error *err = NULL;
352574637e6fSLidong Chen     RDMAContext *rdma;
352674637e6fSLidong Chen     RDMALocalBlocks *local;
3527329c9b10SDr. David Alan Gilbert     RDMAControlHeader head;
3528329c9b10SDr. David Alan Gilbert     RDMARegister *reg, *registers;
3529329c9b10SDr. David Alan Gilbert     RDMACompress *comp;
3530329c9b10SDr. David Alan Gilbert     RDMARegisterResult *reg_result;
3531329c9b10SDr. David Alan Gilbert     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3532329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block;
3533329c9b10SDr. David Alan Gilbert     void *host_addr;
3534c0d77702SMarkus Armbruster     int ret;
3535329c9b10SDr. David Alan Gilbert     int idx = 0;
3536329c9b10SDr. David Alan Gilbert 
3537f6d6c089SJuan Quintela     if (!migrate_rdma()) {
3538f6d6c089SJuan Quintela         return 0;
3539f6d6c089SJuan Quintela     }
3540f6d6c089SJuan Quintela 
3541987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3542f6d6c089SJuan Quintela     rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3543d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmain);
354474637e6fSLidong Chen 
354574637e6fSLidong Chen     if (!rdma) {
35460110c6b8SMarkus Armbruster         return -1;
354774637e6fSLidong Chen     }
354874637e6fSLidong Chen 
3549b86c94a4SMarkus Armbruster     if (rdma_errored(rdma)) {
35500110c6b8SMarkus Armbruster         return -1;
3551de3e05e8SMarkus Armbruster     }
3552329c9b10SDr. David Alan Gilbert 
355374637e6fSLidong Chen     local = &rdma->local_ram_blocks;
3554329c9b10SDr. David Alan Gilbert     do {
3555b1b38387SJuan Quintela         trace_rdma_registration_handle_wait();
3556329c9b10SDr. David Alan Gilbert 
355796f363d8SMarkus Armbruster         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err);
3558329c9b10SDr. David Alan Gilbert 
3559329c9b10SDr. David Alan Gilbert         if (ret < 0) {
356096f363d8SMarkus Armbruster             error_report_err(err);
3561329c9b10SDr. David Alan Gilbert             break;
3562329c9b10SDr. David Alan Gilbert         }
3563329c9b10SDr. David Alan Gilbert 
3564329c9b10SDr. David Alan Gilbert         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3565733252deSDr. David Alan Gilbert             error_report("rdma: Too many requests in this message (%d)."
3566733252deSDr. David Alan Gilbert                             "Bailing.", head.repeat);
3567329c9b10SDr. David Alan Gilbert             break;
3568329c9b10SDr. David Alan Gilbert         }
3569329c9b10SDr. David Alan Gilbert 
3570329c9b10SDr. David Alan Gilbert         switch (head.type) {
3571329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_COMPRESS:
3572329c9b10SDr. David Alan Gilbert             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3573329c9b10SDr. David Alan Gilbert             network_to_compress(comp);
3574329c9b10SDr. David Alan Gilbert 
3575b1b38387SJuan Quintela             trace_rdma_registration_handle_compress(comp->length,
3576733252deSDr. David Alan Gilbert                                                     comp->block_idx,
3577733252deSDr. David Alan Gilbert                                                     comp->offset);
3578afcddefdSDr. David Alan Gilbert             if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3579afcddefdSDr. David Alan Gilbert                 error_report("rdma: 'compress' bad block index %u (vs %d)",
3580afcddefdSDr. David Alan Gilbert                              (unsigned int)comp->block_idx,
3581afcddefdSDr. David Alan Gilbert                              rdma->local_ram_blocks.nb_blocks);
35820110c6b8SMarkus Armbruster                 goto err;
3583afcddefdSDr. David Alan Gilbert             }
3584329c9b10SDr. David Alan Gilbert             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3585329c9b10SDr. David Alan Gilbert 
3586329c9b10SDr. David Alan Gilbert             host_addr = block->local_host_addr +
3587329c9b10SDr. David Alan Gilbert                             (comp->offset - block->offset);
3588413d64feSJuan Quintela             if (comp->value) {
3589413d64feSJuan Quintela                 error_report("rdma: Zero page with non-zero (%d) value",
3590413d64feSJuan Quintela                              comp->value);
3591413d64feSJuan Quintela                 goto err;
3592413d64feSJuan Quintela             }
35937091dabeSJuan Quintela             ram_handle_zero(host_addr, comp->length);
3594329c9b10SDr. David Alan Gilbert             break;
3595329c9b10SDr. David Alan Gilbert 
3596329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_FINISHED:
3597b1b38387SJuan Quintela             trace_rdma_registration_handle_finished();
35980110c6b8SMarkus Armbruster             return 0;
3599329c9b10SDr. David Alan Gilbert 
3600329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3601b1b38387SJuan Quintela             trace_rdma_registration_handle_ram_blocks();
3602329c9b10SDr. David Alan Gilbert 
3603e4d63320SDr. David Alan Gilbert             /* Sort our local RAM Block list so it's the same as the source,
3604e4d63320SDr. David Alan Gilbert              * we can do this since we've filled in a src_index in the list
3605e4d63320SDr. David Alan Gilbert              * as we received the RAMBlock list earlier.
3606e4d63320SDr. David Alan Gilbert              */
3607e4d63320SDr. David Alan Gilbert             qsort(rdma->local_ram_blocks.block,
3608e4d63320SDr. David Alan Gilbert                   rdma->local_ram_blocks.nb_blocks,
3609e4d63320SDr. David Alan Gilbert                   sizeof(RDMALocalBlock), dest_ram_sort_func);
361014e2fcbbSJuan Quintela             for (int i = 0; i < local->nb_blocks; i++) {
361171cd7306SLidong Chen                 local->block[i].index = i;
361271cd7306SLidong Chen             }
361371cd7306SLidong Chen 
3614329c9b10SDr. David Alan Gilbert             if (rdma->pin_all) {
3615de1aa35fSMarkus Armbruster                 ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err);
36164a102179SMarkus Armbruster                 if (ret < 0) {
3617de1aa35fSMarkus Armbruster                     error_report_err(err);
36180110c6b8SMarkus Armbruster                     goto err;
3619329c9b10SDr. David Alan Gilbert                 }
3620329c9b10SDr. David Alan Gilbert             }
3621329c9b10SDr. David Alan Gilbert 
3622329c9b10SDr. David Alan Gilbert             /*
3623329c9b10SDr. David Alan Gilbert              * Dest uses this to prepare to transmit the RAMBlock descriptions
3624329c9b10SDr. David Alan Gilbert              * to the source VM after connection setup.
3625329c9b10SDr. David Alan Gilbert              * Both sides use the "remote" structure to communicate and update
3626329c9b10SDr. David Alan Gilbert              * their "local" descriptions with what was sent.
3627329c9b10SDr. David Alan Gilbert              */
362814e2fcbbSJuan Quintela             for (int i = 0; i < local->nb_blocks; i++) {
3629a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].remote_host_addr =
3630fbce8c25SStefan Weil                     (uintptr_t)(local->block[i].local_host_addr);
3631329c9b10SDr. David Alan Gilbert 
3632329c9b10SDr. David Alan Gilbert                 if (rdma->pin_all) {
3633a97270adSDr. David Alan Gilbert                     rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3634329c9b10SDr. David Alan Gilbert                 }
3635329c9b10SDr. David Alan Gilbert 
3636a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].offset = local->block[i].offset;
3637a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].length = local->block[i].length;
3638329c9b10SDr. David Alan Gilbert 
3639a97270adSDr. David Alan Gilbert                 dest_block_to_network(&rdma->dest_blocks[i]);
3640b1b38387SJuan Quintela                 trace_rdma_registration_handle_ram_blocks_loop(
3641e4d63320SDr. David Alan Gilbert                     local->block[i].block_name,
3642e4d63320SDr. David Alan Gilbert                     local->block[i].offset,
3643e4d63320SDr. David Alan Gilbert                     local->block[i].length,
3644e4d63320SDr. David Alan Gilbert                     local->block[i].local_host_addr,
3645e4d63320SDr. David Alan Gilbert                     local->block[i].src_index);
3646329c9b10SDr. David Alan Gilbert             }
3647329c9b10SDr. David Alan Gilbert 
3648329c9b10SDr. David Alan Gilbert             blocks.len = rdma->local_ram_blocks.nb_blocks
3649a97270adSDr. David Alan Gilbert                                                 * sizeof(RDMADestBlock);
3650329c9b10SDr. David Alan Gilbert 
3651329c9b10SDr. David Alan Gilbert 
3652329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_post_send_control(rdma,
3653f3805964SMarkus Armbruster                                     (uint8_t *) rdma->dest_blocks, &blocks,
3654f3805964SMarkus Armbruster                                     &err);
3655329c9b10SDr. David Alan Gilbert 
3656329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3657f3805964SMarkus Armbruster                 error_report_err(err);
36580110c6b8SMarkus Armbruster                 goto err;
3659329c9b10SDr. David Alan Gilbert             }
3660329c9b10SDr. David Alan Gilbert 
3661329c9b10SDr. David Alan Gilbert             break;
3662329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_REQUEST:
3663b1b38387SJuan Quintela             trace_rdma_registration_handle_register(head.repeat);
3664329c9b10SDr. David Alan Gilbert 
3665329c9b10SDr. David Alan Gilbert             reg_resp.repeat = head.repeat;
3666329c9b10SDr. David Alan Gilbert             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3667329c9b10SDr. David Alan Gilbert 
366814e2fcbbSJuan Quintela             for (int count = 0; count < head.repeat; count++) {
3669329c9b10SDr. David Alan Gilbert                 uint64_t chunk;
3670329c9b10SDr. David Alan Gilbert                 uint8_t *chunk_start, *chunk_end;
3671329c9b10SDr. David Alan Gilbert 
3672329c9b10SDr. David Alan Gilbert                 reg = &registers[count];
3673329c9b10SDr. David Alan Gilbert                 network_to_register(reg);
3674329c9b10SDr. David Alan Gilbert 
3675329c9b10SDr. David Alan Gilbert                 reg_result = &results[count];
3676329c9b10SDr. David Alan Gilbert 
3677b1b38387SJuan Quintela                 trace_rdma_registration_handle_register_loop(count,
3678329c9b10SDr. David Alan Gilbert                          reg->current_index, reg->key.current_addr, reg->chunks);
3679329c9b10SDr. David Alan Gilbert 
3680afcddefdSDr. David Alan Gilbert                 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3681afcddefdSDr. David Alan Gilbert                     error_report("rdma: 'register' bad block index %u (vs %d)",
3682afcddefdSDr. David Alan Gilbert                                  (unsigned int)reg->current_index,
3683afcddefdSDr. David Alan Gilbert                                  rdma->local_ram_blocks.nb_blocks);
36840110c6b8SMarkus Armbruster                     goto err;
3685afcddefdSDr. David Alan Gilbert                 }
3686329c9b10SDr. David Alan Gilbert                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3687329c9b10SDr. David Alan Gilbert                 if (block->is_ram_block) {
3688afcddefdSDr. David Alan Gilbert                     if (block->offset > reg->key.current_addr) {
3689afcddefdSDr. David Alan Gilbert                         error_report("rdma: bad register address for block %s"
3690afcddefdSDr. David Alan Gilbert                             " offset: %" PRIx64 " current_addr: %" PRIx64,
3691afcddefdSDr. David Alan Gilbert                             block->block_name, block->offset,
3692afcddefdSDr. David Alan Gilbert                             reg->key.current_addr);
36930110c6b8SMarkus Armbruster                         goto err;
3694afcddefdSDr. David Alan Gilbert                     }
3695329c9b10SDr. David Alan Gilbert                     host_addr = (block->local_host_addr +
3696329c9b10SDr. David Alan Gilbert                                 (reg->key.current_addr - block->offset));
3697329c9b10SDr. David Alan Gilbert                     chunk = ram_chunk_index(block->local_host_addr,
3698329c9b10SDr. David Alan Gilbert                                             (uint8_t *) host_addr);
3699329c9b10SDr. David Alan Gilbert                 } else {
3700329c9b10SDr. David Alan Gilbert                     chunk = reg->key.chunk;
3701329c9b10SDr. David Alan Gilbert                     host_addr = block->local_host_addr +
3702329c9b10SDr. David Alan Gilbert                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3703afcddefdSDr. David Alan Gilbert                     /* Check for particularly bad chunk value */
3704afcddefdSDr. David Alan Gilbert                     if (host_addr < (void *)block->local_host_addr) {
3705afcddefdSDr. David Alan Gilbert                         error_report("rdma: bad chunk for block %s"
3706afcddefdSDr. David Alan Gilbert                             " chunk: %" PRIx64,
3707afcddefdSDr. David Alan Gilbert                             block->block_name, reg->key.chunk);
37080110c6b8SMarkus Armbruster                         goto err;
3709afcddefdSDr. David Alan Gilbert                     }
3710329c9b10SDr. David Alan Gilbert                 }
3711329c9b10SDr. David Alan Gilbert                 chunk_start = ram_chunk_start(block, chunk);
3712329c9b10SDr. David Alan Gilbert                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
37139589e763SMarcel Apfelbaum                 /* avoid "-Waddress-of-packed-member" warning */
37149589e763SMarcel Apfelbaum                 uint32_t tmp_rkey = 0;
3715329c9b10SDr. David Alan Gilbert                 if (qemu_rdma_register_and_get_keys(rdma, block,
37169589e763SMarcel Apfelbaum                             (uintptr_t)host_addr, NULL, &tmp_rkey,
3717329c9b10SDr. David Alan Gilbert                             chunk, chunk_start, chunk_end)) {
3718733252deSDr. David Alan Gilbert                     error_report("cannot get rkey");
37190110c6b8SMarkus Armbruster                     goto err;
3720329c9b10SDr. David Alan Gilbert                 }
37219589e763SMarcel Apfelbaum                 reg_result->rkey = tmp_rkey;
3722329c9b10SDr. David Alan Gilbert 
3723fbce8c25SStefan Weil                 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3724329c9b10SDr. David Alan Gilbert 
3725b1b38387SJuan Quintela                 trace_rdma_registration_handle_register_rkey(reg_result->rkey);
3726329c9b10SDr. David Alan Gilbert 
3727329c9b10SDr. David Alan Gilbert                 result_to_network(reg_result);
3728329c9b10SDr. David Alan Gilbert             }
3729329c9b10SDr. David Alan Gilbert 
3730329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_post_send_control(rdma,
3731f3805964SMarkus Armbruster                             (uint8_t *) results, &reg_resp, &err);
3732329c9b10SDr. David Alan Gilbert 
3733329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3734f3805964SMarkus Armbruster                 error_report_err(err);
37350110c6b8SMarkus Armbruster                 goto err;
3736329c9b10SDr. David Alan Gilbert             }
3737329c9b10SDr. David Alan Gilbert             break;
3738329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_UNREGISTER_REQUEST:
3739b1b38387SJuan Quintela             trace_rdma_registration_handle_unregister(head.repeat);
3740329c9b10SDr. David Alan Gilbert             unreg_resp.repeat = head.repeat;
3741329c9b10SDr. David Alan Gilbert             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3742329c9b10SDr. David Alan Gilbert 
374314e2fcbbSJuan Quintela             for (int count = 0; count < head.repeat; count++) {
3744329c9b10SDr. David Alan Gilbert                 reg = &registers[count];
3745329c9b10SDr. David Alan Gilbert                 network_to_register(reg);
3746329c9b10SDr. David Alan Gilbert 
3747b1b38387SJuan Quintela                 trace_rdma_registration_handle_unregister_loop(count,
3748733252deSDr. David Alan Gilbert                            reg->current_index, reg->key.chunk);
3749329c9b10SDr. David Alan Gilbert 
3750329c9b10SDr. David Alan Gilbert                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3751329c9b10SDr. David Alan Gilbert 
3752329c9b10SDr. David Alan Gilbert                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3753329c9b10SDr. David Alan Gilbert                 block->pmr[reg->key.chunk] = NULL;
3754329c9b10SDr. David Alan Gilbert 
3755329c9b10SDr. David Alan Gilbert                 if (ret != 0) {
3756ff4c9194SMarkus Armbruster                     error_report("rdma unregistration chunk failed: %s",
3757ff4c9194SMarkus Armbruster                                  strerror(errno));
37580110c6b8SMarkus Armbruster                     goto err;
3759329c9b10SDr. David Alan Gilbert                 }
3760329c9b10SDr. David Alan Gilbert 
3761329c9b10SDr. David Alan Gilbert                 rdma->total_registrations--;
3762329c9b10SDr. David Alan Gilbert 
3763b1b38387SJuan Quintela                 trace_rdma_registration_handle_unregister_success(reg->key.chunk);
3764329c9b10SDr. David Alan Gilbert             }
3765329c9b10SDr. David Alan Gilbert 
3766f3805964SMarkus Armbruster             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err);
3767329c9b10SDr. David Alan Gilbert 
3768329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3769f3805964SMarkus Armbruster                 error_report_err(err);
37700110c6b8SMarkus Armbruster                 goto err;
3771329c9b10SDr. David Alan Gilbert             }
3772329c9b10SDr. David Alan Gilbert             break;
3773329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_RESULT:
3774733252deSDr. David Alan Gilbert             error_report("Invalid RESULT message at dest.");
37750110c6b8SMarkus Armbruster             goto err;
3776329c9b10SDr. David Alan Gilbert         default:
3777482a33c5SDr. David Alan Gilbert             error_report("Unknown control message %s", control_desc(head.type));
37780110c6b8SMarkus Armbruster             goto err;
3779329c9b10SDr. David Alan Gilbert         }
3780329c9b10SDr. David Alan Gilbert     } while (1);
37810110c6b8SMarkus Armbruster 
37820110c6b8SMarkus Armbruster err:
3783b86c94a4SMarkus Armbruster     rdma->errored = true;
37840110c6b8SMarkus Armbruster     return -1;
3785329c9b10SDr. David Alan Gilbert }
3786329c9b10SDr. David Alan Gilbert 
3787e4d63320SDr. David Alan Gilbert /* Destination:
3788a6323300SJuan Quintela  * Called during the initial RAM load section which lists the
3789a6323300SJuan Quintela  * RAMBlocks by name.  This lets us know the order of the RAMBlocks on
3790a6323300SJuan Quintela  * the source.  We've already built our local RAMBlock list, but not
3791a6323300SJuan Quintela  * yet sent the list to the source.
3792e4d63320SDr. David Alan Gilbert  */
rdma_block_notification_handle(QEMUFile * f,const char * name)3793a6323300SJuan Quintela int rdma_block_notification_handle(QEMUFile *f, const char *name)
3794e4d63320SDr. David Alan Gilbert {
3795e4d63320SDr. David Alan Gilbert     int curr;
3796e4d63320SDr. David Alan Gilbert     int found = -1;
3797e4d63320SDr. David Alan Gilbert 
3798a6323300SJuan Quintela     if (!migrate_rdma()) {
3799a6323300SJuan Quintela         return 0;
3800a6323300SJuan Quintela     }
3801a6323300SJuan Quintela 
3802987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3803a6323300SJuan Quintela     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3804a6323300SJuan Quintela     RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain);
380574637e6fSLidong Chen 
380674637e6fSLidong Chen     if (!rdma) {
38070110c6b8SMarkus Armbruster         return -1;
380874637e6fSLidong Chen     }
380974637e6fSLidong Chen 
3810e4d63320SDr. David Alan Gilbert     /* Find the matching RAMBlock in our local list */
3811e4d63320SDr. David Alan Gilbert     for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3812e4d63320SDr. David Alan Gilbert         if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3813e4d63320SDr. David Alan Gilbert             found = curr;
3814e4d63320SDr. David Alan Gilbert             break;
3815e4d63320SDr. David Alan Gilbert         }
3816e4d63320SDr. David Alan Gilbert     }
3817e4d63320SDr. David Alan Gilbert 
3818e4d63320SDr. David Alan Gilbert     if (found == -1) {
3819e4d63320SDr. David Alan Gilbert         error_report("RAMBlock '%s' not found on destination", name);
38200110c6b8SMarkus Armbruster         return -1;
3821e4d63320SDr. David Alan Gilbert     }
3822e4d63320SDr. David Alan Gilbert 
3823e4d63320SDr. David Alan Gilbert     rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3824e4d63320SDr. David Alan Gilbert     trace_rdma_block_notification_handle(name, rdma->next_src_index);
3825e4d63320SDr. David Alan Gilbert     rdma->next_src_index++;
3826e4d63320SDr. David Alan Gilbert 
3827e4d63320SDr. David Alan Gilbert     return 0;
3828e4d63320SDr. David Alan Gilbert }
3829e4d63320SDr. David Alan Gilbert 
/*
 * Emit the RAM_SAVE_FLAG_HOOK marker on @f and flush it.
 *
 * Does nothing and returns 0 when RDMA migration is not in use or the
 * migration is in postcopy.  Returns -1 when there is no outgoing RDMA
 * context or that context is in the error state; otherwise returns the
 * result of flushing @f.
 */
int rdma_registration_start(QEMUFile *f, uint64_t flags)
{
    QIOChannelRDMA *rioc;
    RDMAContext *rdma;

    if (!migrate_rdma()) {
        return 0;
    }
    if (migration_in_postcopy()) {
        return 0;
    }

    rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma || rdma_errored(rdma)) {
        return -1;
    }

    trace_rdma_registration_start(flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);

    return qemu_fflush(f);
}
3851329c9b10SDr. David Alan Gilbert 
3852329c9b10SDr. David Alan Gilbert /*
3853329c9b10SDr. David Alan Gilbert  * Inform dest that dynamic registrations are done for now.
3854329c9b10SDr. David Alan Gilbert  * First, flush writes, if any.
3855329c9b10SDr. David Alan Gilbert  */
rdma_registration_stop(QEMUFile * f,uint64_t flags)3856b1b38387SJuan Quintela int rdma_registration_stop(QEMUFile *f, uint64_t flags)
3857329c9b10SDr. David Alan Gilbert {
38585f5b8858SJuan Quintela     QIOChannelRDMA *rioc;
3859c4c78dceSMarkus Armbruster     Error *err = NULL;
386074637e6fSLidong Chen     RDMAContext *rdma;
3861329c9b10SDr. David Alan Gilbert     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3862c0d77702SMarkus Armbruster     int ret;
3863329c9b10SDr. David Alan Gilbert 
38645f5b8858SJuan Quintela     if (!migrate_rdma() || migration_in_postcopy()) {
3865cd01a602SJuan Quintela         return 0;
3866cd01a602SJuan Quintela     }
3867cd01a602SJuan Quintela 
3868987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
38695f5b8858SJuan Quintela     rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3870d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
387174637e6fSLidong Chen     if (!rdma) {
38720110c6b8SMarkus Armbruster         return -1;
387374637e6fSLidong Chen     }
387474637e6fSLidong Chen 
3875b86c94a4SMarkus Armbruster     if (rdma_errored(rdma)) {
38760110c6b8SMarkus Armbruster         return -1;
3877de3e05e8SMarkus Armbruster     }
3878329c9b10SDr. David Alan Gilbert 
3879329c9b10SDr. David Alan Gilbert     qemu_fflush(f);
3880e3378035SJuan Quintela     ret = qemu_rdma_drain_cq(rdma);
3881329c9b10SDr. David Alan Gilbert 
3882329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3883329c9b10SDr. David Alan Gilbert         goto err;
3884329c9b10SDr. David Alan Gilbert     }
3885329c9b10SDr. David Alan Gilbert 
3886329c9b10SDr. David Alan Gilbert     if (flags == RAM_CONTROL_SETUP) {
3887329c9b10SDr. David Alan Gilbert         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3888329c9b10SDr. David Alan Gilbert         RDMALocalBlocks *local = &rdma->local_ram_blocks;
388914e2fcbbSJuan Quintela         int reg_result_idx, nb_dest_blocks;
3890329c9b10SDr. David Alan Gilbert 
3891329c9b10SDr. David Alan Gilbert         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3892b1b38387SJuan Quintela         trace_rdma_registration_stop_ram();
3893329c9b10SDr. David Alan Gilbert 
3894329c9b10SDr. David Alan Gilbert         /*
3895329c9b10SDr. David Alan Gilbert          * Make sure that we parallelize the pinning on both sides.
3896329c9b10SDr. David Alan Gilbert          * For very large guests, doing this serially takes a really
3897329c9b10SDr. David Alan Gilbert          * long time, so we have to 'interleave' the pinning locally
3898329c9b10SDr. David Alan Gilbert          * with the control messages by performing the pinning on this
3899329c9b10SDr. David Alan Gilbert          * side before we receive the control response from the other
3900329c9b10SDr. David Alan Gilbert          * side that the pinning has completed.
3901329c9b10SDr. David Alan Gilbert          */
3902329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3903329c9b10SDr. David Alan Gilbert                     &reg_result_idx, rdma->pin_all ?
3904c4c78dceSMarkus Armbruster                     qemu_rdma_reg_whole_ram_blocks : NULL,
3905c4c78dceSMarkus Armbruster                     &err);
3906329c9b10SDr. David Alan Gilbert         if (ret < 0) {
3907c4c78dceSMarkus Armbruster             error_report_err(err);
39080110c6b8SMarkus Armbruster             return -1;
3909329c9b10SDr. David Alan Gilbert         }
3910329c9b10SDr. David Alan Gilbert 
3911a97270adSDr. David Alan Gilbert         nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3912329c9b10SDr. David Alan Gilbert 
3913329c9b10SDr. David Alan Gilbert         /*
3914329c9b10SDr. David Alan Gilbert          * The protocol uses two different sets of rkeys (mutually exclusive):
3915329c9b10SDr. David Alan Gilbert          * 1. One key to represent the virtual address of the entire ram block.
3916329c9b10SDr. David Alan Gilbert          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3917329c9b10SDr. David Alan Gilbert          * 2. One to represent individual chunks within a ram block.
3918329c9b10SDr. David Alan Gilbert          *    (dynamic chunk registration enabled - pin individual chunks.)
3919329c9b10SDr. David Alan Gilbert          *
3920329c9b10SDr. David Alan Gilbert          * Once the capability is successfully negotiated, the destination transmits
3921329c9b10SDr. David Alan Gilbert          * the keys to use (or sends them later) including the virtual addresses
3922329c9b10SDr. David Alan Gilbert          * and then propagates the remote ram block descriptions to his local copy.
3923329c9b10SDr. David Alan Gilbert          */
3924329c9b10SDr. David Alan Gilbert 
3925a97270adSDr. David Alan Gilbert         if (local->nb_blocks != nb_dest_blocks) {
3926ff4c9194SMarkus Armbruster             error_report("ram blocks mismatch (Number of blocks %d vs %d)",
3927e4d63320SDr. David Alan Gilbert                          local->nb_blocks, nb_dest_blocks);
3928ff4c9194SMarkus Armbruster             error_printf("Your QEMU command line parameters are probably "
3929ff4c9194SMarkus Armbruster                          "not identical on both the source and destination.");
3930b86c94a4SMarkus Armbruster             rdma->errored = true;
39310110c6b8SMarkus Armbruster             return -1;
3932329c9b10SDr. David Alan Gilbert         }
3933329c9b10SDr. David Alan Gilbert 
3934329c9b10SDr. David Alan Gilbert         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3935a97270adSDr. David Alan Gilbert         memcpy(rdma->dest_blocks,
3936329c9b10SDr. David Alan Gilbert             rdma->wr_data[reg_result_idx].control_curr, resp.len);
393714e2fcbbSJuan Quintela         for (int i = 0; i < nb_dest_blocks; i++) {
3938a97270adSDr. David Alan Gilbert             network_to_dest_block(&rdma->dest_blocks[i]);
3939329c9b10SDr. David Alan Gilbert 
3940e4d63320SDr. David Alan Gilbert             /* We require that the blocks are in the same order */
3941e4d63320SDr. David Alan Gilbert             if (rdma->dest_blocks[i].length != local->block[i].length) {
3942ff4c9194SMarkus Armbruster                 error_report("Block %s/%d has a different length %" PRIu64
3943ff4c9194SMarkus Armbruster                              "vs %" PRIu64,
3944ff4c9194SMarkus Armbruster                              local->block[i].block_name, i,
3945e4d63320SDr. David Alan Gilbert                              local->block[i].length,
3946e4d63320SDr. David Alan Gilbert                              rdma->dest_blocks[i].length);
3947b86c94a4SMarkus Armbruster                 rdma->errored = true;
39480110c6b8SMarkus Armbruster                 return -1;
3949329c9b10SDr. David Alan Gilbert             }
3950e4d63320SDr. David Alan Gilbert             local->block[i].remote_host_addr =
3951a97270adSDr. David Alan Gilbert                     rdma->dest_blocks[i].remote_host_addr;
3952e4d63320SDr. David Alan Gilbert             local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3953329c9b10SDr. David Alan Gilbert         }
3954329c9b10SDr. David Alan Gilbert     }
3955329c9b10SDr. David Alan Gilbert 
3956b1b38387SJuan Quintela     trace_rdma_registration_stop(flags);
3957329c9b10SDr. David Alan Gilbert 
3958329c9b10SDr. David Alan Gilbert     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3959c4c78dceSMarkus Armbruster     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err);
3960329c9b10SDr. David Alan Gilbert 
3961329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3962c4c78dceSMarkus Armbruster         error_report_err(err);
3963329c9b10SDr. David Alan Gilbert         goto err;
3964329c9b10SDr. David Alan Gilbert     }
3965329c9b10SDr. David Alan Gilbert 
3966329c9b10SDr. David Alan Gilbert     return 0;
3967329c9b10SDr. David Alan Gilbert err:
3968b86c94a4SMarkus Armbruster     rdma->errored = true;
39690110c6b8SMarkus Armbruster     return -1;
3970329c9b10SDr. David Alan Gilbert }
3971329c9b10SDr. David Alan Gilbert 
/*
 * Instance finalizer for the RDMA I/O channel type.
 *
 * Handles the two context pointers held by the channel in turn (rdmain
 * first, then rdmaout): when a pointer is set, the context is passed to
 * qemu_rdma_cleanup(), released with g_free(), and the field is reset
 * to NULL.
 */
static void qio_channel_rdma_finalize(Object *obj)
{
    QIOChannelRDMA *chan = QIO_CHANNEL_RDMA(obj);

    if (chan->rdmain != NULL) {
        qemu_rdma_cleanup(chan->rdmain);
        g_free(chan->rdmain);
        chan->rdmain = NULL;
    }

    if (chan->rdmaout != NULL) {
        qemu_rdma_cleanup(chan->rdmaout);
        g_free(chan->rdmaout);
        chan->rdmaout = NULL;
    }
}
39866ddd2d76SDaniel P. Berrange 
/*
 * Class initializer for the RDMA I/O channel type.
 *
 * Fills in the QIOChannelClass callback slots with the RDMA-specific
 * implementations defined in this file.  @class_data is unused.
 */
static void qio_channel_rdma_class_init(ObjectClass *klass,
                                        void *class_data G_GNUC_UNUSED)
{
    QIOChannelClass *channel_class = QIO_CHANNEL_CLASS(klass);

    /* Data transfer */
    channel_class->io_readv = qio_channel_rdma_readv;
    channel_class->io_writev = qio_channel_rdma_writev;

    /* Blocking mode and event notification */
    channel_class->io_set_blocking = qio_channel_rdma_set_blocking;
    channel_class->io_create_watch = qio_channel_rdma_create_watch;
    channel_class->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;

    /* Teardown */
    channel_class->io_shutdown = qio_channel_rdma_shutdown;
    channel_class->io_close = qio_channel_rdma_close;
}
40006ddd2d76SDaniel P. Berrange 
40016ddd2d76SDaniel P. Berrange static const TypeInfo qio_channel_rdma_info = {
40026ddd2d76SDaniel P. Berrange     .parent = TYPE_QIO_CHANNEL,
40036ddd2d76SDaniel P. Berrange     .name = TYPE_QIO_CHANNEL_RDMA,
40046ddd2d76SDaniel P. Berrange     .instance_size = sizeof(QIOChannelRDMA),
40056ddd2d76SDaniel P. Berrange     .instance_finalize = qio_channel_rdma_finalize,
40066ddd2d76SDaniel P. Berrange     .class_init = qio_channel_rdma_class_init,
40076ddd2d76SDaniel P. Berrange };
40086ddd2d76SDaniel P. Berrange 
qio_channel_rdma_register_types(void)40096ddd2d76SDaniel P. Berrange static void qio_channel_rdma_register_types(void)
40106ddd2d76SDaniel P. Berrange {
40116ddd2d76SDaniel P. Berrange     type_register_static(&qio_channel_rdma_info);
40126ddd2d76SDaniel P. Berrange }
40136ddd2d76SDaniel P. Berrange 
40146ddd2d76SDaniel P. Berrange type_init(qio_channel_rdma_register_types);
40156ddd2d76SDaniel P. Berrange 
rdma_new_input(RDMAContext * rdma)4016697c4c86SJuan Quintela static QEMUFile *rdma_new_input(RDMAContext *rdma)
40176ddd2d76SDaniel P. Berrange {
4018697c4c86SJuan Quintela     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4019329c9b10SDr. David Alan Gilbert 
402077ef2dc1SDaniel P. Berrangé     rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
402174637e6fSLidong Chen     rioc->rdmain = rdma;
402274637e6fSLidong Chen     rioc->rdmaout = rdma->return_path;
4023697c4c86SJuan Quintela 
4024697c4c86SJuan Quintela     return rioc->file;
4025329c9b10SDr. David Alan Gilbert }
4026329c9b10SDr. David Alan Gilbert 
rdma_new_output(RDMAContext * rdma)4027697c4c86SJuan Quintela static QEMUFile *rdma_new_output(RDMAContext *rdma)
4028697c4c86SJuan Quintela {
4029697c4c86SJuan Quintela     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4030697c4c86SJuan Quintela 
4031697c4c86SJuan Quintela     rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
4032697c4c86SJuan Quintela     rioc->rdmaout = rdma;
4033697c4c86SJuan Quintela     rioc->rdmain = rdma->return_path;
4034697c4c86SJuan Quintela 
40356ddd2d76SDaniel P. Berrange     return rioc->file;
4036329c9b10SDr. David Alan Gilbert }
4037329c9b10SDr. David Alan Gilbert 
rdma_accept_incoming_migration(void * opaque)4038329c9b10SDr. David Alan Gilbert static void rdma_accept_incoming_migration(void *opaque)
4039329c9b10SDr. David Alan Gilbert {
4040329c9b10SDr. David Alan Gilbert     RDMAContext *rdma = opaque;
4041329c9b10SDr. David Alan Gilbert     QEMUFile *f;
4042329c9b10SDr. David Alan Gilbert 
404324ec68efSDr. David Alan Gilbert     trace_qemu_rdma_accept_incoming_migration();
40448f5a7faaSJuan Quintela     if (qemu_rdma_accept(rdma) < 0) {
4045ff4c9194SMarkus Armbruster         error_report("RDMA ERROR: Migration initialization failed");
4046329c9b10SDr. David Alan Gilbert         return;
4047329c9b10SDr. David Alan Gilbert     }
4048329c9b10SDr. David Alan Gilbert 
404924ec68efSDr. David Alan Gilbert     trace_qemu_rdma_accept_incoming_migration_accepted();
4050329c9b10SDr. David Alan Gilbert 
405155cc1b59SLidong Chen     if (rdma->is_return_path) {
405255cc1b59SLidong Chen         return;
405355cc1b59SLidong Chen     }
405455cc1b59SLidong Chen 
4055697c4c86SJuan Quintela     f = rdma_new_input(rdma);
4056329c9b10SDr. David Alan Gilbert     if (f == NULL) {
4057ff4c9194SMarkus Armbruster         error_report("RDMA ERROR: could not open RDMA for input");
4058329c9b10SDr. David Alan Gilbert         qemu_rdma_cleanup(rdma);
4059329c9b10SDr. David Alan Gilbert         return;
4060329c9b10SDr. David Alan Gilbert     }
4061329c9b10SDr. David Alan Gilbert 
4062329c9b10SDr. David Alan Gilbert     rdma->migration_started_on_destination = 1;
4063b0cf3bfcSAvihai Horon     migration_fd_process_incoming(f);
4064329c9b10SDr. David Alan Gilbert }
4065329c9b10SDr. David Alan Gilbert 
rdma_start_incoming_migration(InetSocketAddress * host_port,Error ** errp)40663fa9642fSHet Gala void rdma_start_incoming_migration(InetSocketAddress *host_port,
40673fa9642fSHet Gala                                    Error **errp)
4068329c9b10SDr. David Alan Gilbert {
406927fd25b0SJuan Quintela     MigrationState *s = migrate_get_current();
4070329c9b10SDr. David Alan Gilbert     int ret;
4071bf027419SLi Zhijian     RDMAContext *rdma;
4072329c9b10SDr. David Alan Gilbert 
4073733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration();
4074329c9b10SDr. David Alan Gilbert 
40755f1f1902SDavid Hildenbrand     /* Avoid ram_block_discard_disable(), cannot change during migration. */
40765f1f1902SDavid Hildenbrand     if (ram_block_discard_is_required()) {
40775f1f1902SDavid Hildenbrand         error_setg(errp, "RDMA: cannot disable RAM discard");
40785f1f1902SDavid Hildenbrand         return;
40795f1f1902SDavid Hildenbrand     }
40805f1f1902SDavid Hildenbrand 
4081b16defbbSMarkus Armbruster     rdma = qemu_rdma_data_init(host_port, errp);
4082329c9b10SDr. David Alan Gilbert     if (rdma == NULL) {
4083329c9b10SDr. David Alan Gilbert         goto err;
4084329c9b10SDr. David Alan Gilbert     }
4085329c9b10SDr. David Alan Gilbert 
4086b16defbbSMarkus Armbruster     ret = qemu_rdma_dest_init(rdma, errp);
40874a102179SMarkus Armbruster     if (ret < 0) {
4088329c9b10SDr. David Alan Gilbert         goto err;
4089329c9b10SDr. David Alan Gilbert     }
4090329c9b10SDr. David Alan Gilbert 
4091733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration_after_dest_init();
4092329c9b10SDr. David Alan Gilbert 
4093329c9b10SDr. David Alan Gilbert     ret = rdma_listen(rdma->listen_id, 5);
4094329c9b10SDr. David Alan Gilbert 
40954a102179SMarkus Armbruster     if (ret < 0) {
40968fd471bdSMarkus Armbruster         error_setg(errp, "RDMA ERROR: listening on socket!");
40974e812d23SLi Zhijian         goto cleanup_rdma;
4098329c9b10SDr. David Alan Gilbert     }
4099329c9b10SDr. David Alan Gilbert 
4100733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration_after_rdma_listen();
410127fd25b0SJuan Quintela     s->rdma_migration = true;
410282e1cc4bSFam Zheng     qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
410382e1cc4bSFam Zheng                         NULL, (void *)(intptr_t)rdma);
4104329c9b10SDr. David Alan Gilbert     return;
41054e812d23SLi Zhijian 
41064e812d23SLi Zhijian cleanup_rdma:
41074e812d23SLi Zhijian     qemu_rdma_cleanup(rdma);
4108329c9b10SDr. David Alan Gilbert err:
41093b59ee72SPan Nengyuan     if (rdma) {
411059c59c67SPan Nengyuan         g_free(rdma->host);
41113b59ee72SPan Nengyuan     }
4112329c9b10SDr. David Alan Gilbert     g_free(rdma);
4113329c9b10SDr. David Alan Gilbert }
4114329c9b10SDr. David Alan Gilbert 
rdma_start_outgoing_migration(void * opaque,InetSocketAddress * host_port,Error ** errp)4115329c9b10SDr. David Alan Gilbert void rdma_start_outgoing_migration(void *opaque,
41163fa9642fSHet Gala                             InetSocketAddress *host_port, Error **errp)
4117329c9b10SDr. David Alan Gilbert {
4118329c9b10SDr. David Alan Gilbert     MigrationState *s = opaque;
411955cc1b59SLidong Chen     RDMAContext *rdma_return_path = NULL;
41205f1f1902SDavid Hildenbrand     RDMAContext *rdma;
4121c0d77702SMarkus Armbruster     int ret;
4122329c9b10SDr. David Alan Gilbert 
41235f1f1902SDavid Hildenbrand     /* Avoid ram_block_discard_disable(), cannot change during migration. */
41245f1f1902SDavid Hildenbrand     if (ram_block_discard_is_required()) {
41255f1f1902SDavid Hildenbrand         error_setg(errp, "RDMA: cannot disable RAM discard");
41265f1f1902SDavid Hildenbrand         return;
41275f1f1902SDavid Hildenbrand     }
41285f1f1902SDavid Hildenbrand 
41295f1f1902SDavid Hildenbrand     rdma = qemu_rdma_data_init(host_port, errp);
4130329c9b10SDr. David Alan Gilbert     if (rdma == NULL) {
4131329c9b10SDr. David Alan Gilbert         goto err;
4132329c9b10SDr. David Alan Gilbert     }
4133329c9b10SDr. David Alan Gilbert 
413417cba690SJuan Quintela     ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
4135329c9b10SDr. David Alan Gilbert 
41364a102179SMarkus Armbruster     if (ret < 0) {
4137329c9b10SDr. David Alan Gilbert         goto err;
4138329c9b10SDr. David Alan Gilbert     }
4139329c9b10SDr. David Alan Gilbert 
4140733252deSDr. David Alan Gilbert     trace_rdma_start_outgoing_migration_after_rdma_source_init();
41413c03f21cSMarkus Armbruster     ret = qemu_rdma_connect(rdma, false, errp);
4142329c9b10SDr. David Alan Gilbert 
41434a102179SMarkus Armbruster     if (ret < 0) {
4144329c9b10SDr. David Alan Gilbert         goto err;
4145329c9b10SDr. David Alan Gilbert     }
4146329c9b10SDr. David Alan Gilbert 
41473a4452d8Szhaolichang     /* RDMA postcopy need a separate queue pair for return path */
414838ad1110SJuan Quintela     if (migrate_postcopy() || migrate_return_path()) {
414955cc1b59SLidong Chen         rdma_return_path = qemu_rdma_data_init(host_port, errp);
415055cc1b59SLidong Chen 
415155cc1b59SLidong Chen         if (rdma_return_path == NULL) {
41522f0c285aSPan Nengyuan             goto return_path_err;
415355cc1b59SLidong Chen         }
415455cc1b59SLidong Chen 
415555cc1b59SLidong Chen         ret = qemu_rdma_source_init(rdma_return_path,
415617cba690SJuan Quintela                                     migrate_rdma_pin_all(), errp);
415755cc1b59SLidong Chen 
41584a102179SMarkus Armbruster         if (ret < 0) {
41592f0c285aSPan Nengyuan             goto return_path_err;
416055cc1b59SLidong Chen         }
416155cc1b59SLidong Chen 
41623c03f21cSMarkus Armbruster         ret = qemu_rdma_connect(rdma_return_path, true, errp);
416355cc1b59SLidong Chen 
41644a102179SMarkus Armbruster         if (ret < 0) {
41652f0c285aSPan Nengyuan             goto return_path_err;
416655cc1b59SLidong Chen         }
416755cc1b59SLidong Chen 
416855cc1b59SLidong Chen         rdma->return_path = rdma_return_path;
416955cc1b59SLidong Chen         rdma_return_path->return_path = rdma;
417055cc1b59SLidong Chen         rdma_return_path->is_return_path = true;
417155cc1b59SLidong Chen     }
417255cc1b59SLidong Chen 
4173733252deSDr. David Alan Gilbert     trace_rdma_start_outgoing_migration_after_rdma_connect();
4174329c9b10SDr. David Alan Gilbert 
4175697c4c86SJuan Quintela     s->to_dst_file = rdma_new_output(rdma);
417627fd25b0SJuan Quintela     s->rdma_migration = true;
4177cce8040bSDr. David Alan Gilbert     migrate_fd_connect(s, NULL);
4178329c9b10SDr. David Alan Gilbert     return;
41792f0c285aSPan Nengyuan return_path_err:
41802f0c285aSPan Nengyuan     qemu_rdma_cleanup(rdma);
4181329c9b10SDr. David Alan Gilbert err:
4182329c9b10SDr. David Alan Gilbert     g_free(rdma);
418355cc1b59SLidong Chen     g_free(rdma_return_path);
4184329c9b10SDr. David Alan Gilbert }
4185