xref: /openbmc/qemu/migration/rdma.c (revision c61d2faa934ecafbba42d5e410298115a483175e)
1329c9b10SDr. David Alan Gilbert /*
2329c9b10SDr. David Alan Gilbert  * RDMA protocol and interfaces
3329c9b10SDr. David Alan Gilbert  *
4329c9b10SDr. David Alan Gilbert  * Copyright IBM, Corp. 2010-2013
56ddd2d76SDaniel P. Berrange  * Copyright Red Hat, Inc. 2015-2016
6329c9b10SDr. David Alan Gilbert  *
7329c9b10SDr. David Alan Gilbert  * Authors:
8329c9b10SDr. David Alan Gilbert  *  Michael R. Hines <mrhines@us.ibm.com>
9329c9b10SDr. David Alan Gilbert  *  Jiuxing Liu <jl@us.ibm.com>
106ddd2d76SDaniel P. Berrange  *  Daniel P. Berrange <berrange@redhat.com>
11329c9b10SDr. David Alan Gilbert  *
12329c9b10SDr. David Alan Gilbert  * This work is licensed under the terms of the GNU GPL, version 2 or
13329c9b10SDr. David Alan Gilbert  * later.  See the COPYING file in the top-level directory.
14329c9b10SDr. David Alan Gilbert  *
15329c9b10SDr. David Alan Gilbert  */
160b8fa32fSMarkus Armbruster 
171393a485SPeter Maydell #include "qemu/osdep.h"
18da34e65cSMarkus Armbruster #include "qapi/error.h"
19f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
20*c61d2faaSJuan Quintela #include "exec/target_page.h"
21e1a3eceeSJuan Quintela #include "rdma.h"
226666c96aSJuan Quintela #include "migration.h"
23*c61d2faaSJuan Quintela #include "migration-stats.h"
2408a0aee1SJuan Quintela #include "qemu-file.h"
257b1e1a22SJuan Quintela #include "ram.h"
26d49b6836SMarkus Armbruster #include "qemu/error-report.h"
27329c9b10SDr. David Alan Gilbert #include "qemu/main-loop.h"
280b8fa32fSMarkus Armbruster #include "qemu/module.h"
29d4842052SMarkus Armbruster #include "qemu/rcu.h"
30329c9b10SDr. David Alan Gilbert #include "qemu/sockets.h"
31329c9b10SDr. David Alan Gilbert #include "qemu/bitmap.h"
3210817bf0SDaniel P. Berrange #include "qemu/coroutine.h"
335f1f1902SDavid Hildenbrand #include "exec/memory.h"
34329c9b10SDr. David Alan Gilbert #include <sys/socket.h>
35329c9b10SDr. David Alan Gilbert #include <netdb.h>
36329c9b10SDr. David Alan Gilbert #include <arpa/inet.h>
37329c9b10SDr. David Alan Gilbert #include <rdma/rdma_cma.h>
38733252deSDr. David Alan Gilbert #include "trace.h"
39db1015e9SEduardo Habkost #include "qom/object.h"
4017cba690SJuan Quintela #include "options.h"
41e49e49ddSLi Zhijian #include <poll.h>
42329c9b10SDr. David Alan Gilbert 
/*
 * Print an error on both the Monitor and the Log file.
 *
 * The message, prefixed with "RDMA ERROR: ", is always written to stderr.
 * It is additionally stored through errp via error_setg(), but only when
 * errp is non-NULL and no error has been stored there yet (*errp == NULL),
 * so the first error reported wins.
 *
 * Note: errp is evaluated more than once; pass a side-effect-free
 * expression.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if ((errp) && (*(errp) == NULL)) { \
            error_setg((errp), "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
53329c9b10SDr. David Alan Gilbert 
/* Resolve timeout, expressed in milliseconds. */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Upper bound (2 MB) on merged data; do not merge if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
/* RDMA_MERGE_MAX counted in 4096-byte units. */
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* log2 of the registration chunk size: 1 << 20 == 1 MB. */
#define RDMA_REG_CHUNK_SHIFT 20
61329c9b10SDr. David Alan Gilbert 
/*
 * Non-live state only: that state travels in RDMA_SEND messages rather
 * than RDMA_WRITE messages, which calls for a different delivery design
 * than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/* Largest infiniband SEND message we use (512 KB). */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

/* Current version of the control protocol. */
#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capability flags used for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Bitmask of every capability flag this file knows about.
 * When a new flag is introduced above, OR it into this list as well.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
87329c9b10SDr. David Alan Gilbert 
/*
 * Bail out of the calling function if the RDMA context is in an error state.
 *
 * Expects a variable named 'rdma' (pointer to the RDMA state) in the caller's
 * scope.  When rdma->error_state is non-zero the macro returns that value
 * from the enclosing function; the diagnostic is emitted only once, guarded
 * by rdma->error_reported.
 */
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state && !rdma->error_reported) { \
            error_report("RDMA is in an error state waiting migration" \
                            " to abort!"); \
            rdma->error_reported = 1; \
        } \
        if (rdma->error_state) { \
            return rdma->error_state; \
        } \
    } while (0)
99329c9b10SDr. David Alan Gilbert 
/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 *
 * The masks are built from 1ULL so that they are 64 bits wide on every
 * host; with 1UL they would be truncated to 32 bits wherever
 * 'unsigned long' is 32 bits, and the chunk mask could not cover
 * bits 32-63.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

/* bits 0-15 */
#define RDMA_WRID_TYPE_MASK \
    ((1ULL << RDMA_WRID_BLOCK_SHIFT) - 1ULL)

/* bits 16-29 */
#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1ULL << RDMA_WRID_CHUNK_SHIFT) - 1ULL))

/* bits 30-63: everything not claimed by the type and block fields */
#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
123329c9b10SDr. David Alan Gilbert 
/*
 * Work request types of the RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE         = 0,
    RDMA_WRID_RDMA_WRITE   = 1,    /* data message */
    RDMA_WRID_SEND_CONTROL = 2000, /* control channel, send side */
    RDMA_WRID_RECV_CONTROL = 4000, /* control channel, receive side */
};
135329c9b10SDr. David Alan Gilbert 
1362ae31aeaSStefan Weil static const char *wrid_desc[] = {
137329c9b10SDr. David Alan Gilbert     [RDMA_WRID_NONE] = "NONE",
138329c9b10SDr. David Alan Gilbert     [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
139329c9b10SDr. David Alan Gilbert     [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
140329c9b10SDr. David Alan Gilbert     [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
141329c9b10SDr. David Alan Gilbert };
142329c9b10SDr. David Alan Gilbert 
/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * The migration protocol uses these to transmit control messages
 * (such as device state and registration commands).
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY   = 0,
    RDMA_WRID_DATA    = 1,
    RDMA_WRID_CONTROL = 2,
    RDMA_WRID_MAX     = 3, /* number of IDs above */
};
156329c9b10SDr. David Alan Gilbert 
/*
 * SEND/RECV IB control message types.
 */
enum {
    RDMA_CONTROL_NONE                = 0,
    RDMA_CONTROL_ERROR               = 1,
    RDMA_CONTROL_READY               = 2,  /* ready to receive */
    RDMA_CONTROL_QEMU_FILE           = 3,  /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST  = 4,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT   = 5,  /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS            = 6,  /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST    = 7,  /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT     = 8,  /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED   = 9,  /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST  = 10, /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED = 11, /* unpinning finished */
};
174329c9b10SDr. David Alan Gilbert 
175329c9b10SDr. David Alan Gilbert 
176329c9b10SDr. David Alan Gilbert /*
177329c9b10SDr. David Alan Gilbert  * Memory and MR structures used to represent an IB Send/Recv work request.
178329c9b10SDr. David Alan Gilbert  * This is *not* used for RDMA writes, only IB Send/Recv.
179329c9b10SDr. David Alan Gilbert  */
180329c9b10SDr. David Alan Gilbert typedef struct {
181329c9b10SDr. David Alan Gilbert     uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
182329c9b10SDr. David Alan Gilbert     struct   ibv_mr *control_mr;               /* registration metadata */
183329c9b10SDr. David Alan Gilbert     size_t   control_len;                      /* length of the message */
184329c9b10SDr. David Alan Gilbert     uint8_t *control_curr;                     /* start of unconsumed bytes */
185329c9b10SDr. David Alan Gilbert } RDMAWorkRequestData;
186329c9b10SDr. David Alan Gilbert 
/*
 * Capability record exchanged to negotiate RDMA capabilities
 * during connection-setup time.
 */
typedef struct {
    uint32_t version; /* protocol version */
    uint32_t flags;   /* capability flag bits */
} RDMACapabilities;
194329c9b10SDr. David Alan Gilbert 
195329c9b10SDr. David Alan Gilbert static void caps_to_network(RDMACapabilities *cap)
196329c9b10SDr. David Alan Gilbert {
197329c9b10SDr. David Alan Gilbert     cap->version = htonl(cap->version);
198329c9b10SDr. David Alan Gilbert     cap->flags = htonl(cap->flags);
199329c9b10SDr. David Alan Gilbert }
200329c9b10SDr. David Alan Gilbert 
201329c9b10SDr. David Alan Gilbert static void network_to_caps(RDMACapabilities *cap)
202329c9b10SDr. David Alan Gilbert {
203329c9b10SDr. David Alan Gilbert     cap->version = ntohl(cap->version);
204329c9b10SDr. David Alan Gilbert     cap->flags = ntohl(cap->flags);
205329c9b10SDr. David Alan Gilbert }
206329c9b10SDr. David Alan Gilbert 
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is local only and never transmitted.
 * This and the subsequent structures cannot be linked lists because a
 * single IB message is used to transmit the information.  It's small
 * anyway, so a list is overkill.
 */
typedef struct RDMALocalBlock {
    char *block_name;
    uint8_t *local_host_addr;         /* local virtual address */
    uint64_t remote_host_addr;        /* remote virtual address */
    uint64_t offset;
    uint64_t length;
    struct ibv_mr **pmr;              /* MRs for chunk-level registration */
    struct ibv_mr *mr;                /* MR for non-chunk-level registration */
    uint32_t *remote_keys;            /* rkeys for chunk-level registration */
    uint32_t remote_rkey;             /* rkeys for non-chunk-level registration */
    int index;                        /* which block are we */
    unsigned int src_index;           /* (Only used on dest) */
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
231329c9b10SDr. David Alan Gilbert 
232329c9b10SDr. David Alan Gilbert /*
233329c9b10SDr. David Alan Gilbert  * Also represents a RAMblock, but only on the dest.
234329c9b10SDr. David Alan Gilbert  * This gets transmitted by the dest during connection-time
235329c9b10SDr. David Alan Gilbert  * to the source VM and then is used to populate the
236329c9b10SDr. David Alan Gilbert  * corresponding RDMALocalBlock with
237329c9b10SDr. David Alan Gilbert  * the information needed to perform the actual RDMA.
238329c9b10SDr. David Alan Gilbert  */
239a97270adSDr. David Alan Gilbert typedef struct QEMU_PACKED RDMADestBlock {
240329c9b10SDr. David Alan Gilbert     uint64_t remote_host_addr;
241329c9b10SDr. David Alan Gilbert     uint64_t offset;
242329c9b10SDr. David Alan Gilbert     uint64_t length;
243329c9b10SDr. David Alan Gilbert     uint32_t remote_rkey;
244329c9b10SDr. David Alan Gilbert     uint32_t padding;
245a97270adSDr. David Alan Gilbert } RDMADestBlock;
246329c9b10SDr. David Alan Gilbert 
247482a33c5SDr. David Alan Gilbert static const char *control_desc(unsigned int rdma_control)
248482a33c5SDr. David Alan Gilbert {
249482a33c5SDr. David Alan Gilbert     static const char *strs[] = {
250482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_NONE] = "NONE",
251482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_ERROR] = "ERROR",
252482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_READY] = "READY",
253482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
254482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
255482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
256482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_COMPRESS] = "COMPRESS",
257482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
258482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
259482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
260482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
261482a33c5SDr. David Alan Gilbert         [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
262482a33c5SDr. David Alan Gilbert     };
263482a33c5SDr. David Alan Gilbert 
264482a33c5SDr. David Alan Gilbert     if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
265482a33c5SDr. David Alan Gilbert         return "??BAD CONTROL VALUE??";
266482a33c5SDr. David Alan Gilbert     }
267482a33c5SDr. David Alan Gilbert 
268482a33c5SDr. David Alan Gilbert     return strs[rdma_control];
269482a33c5SDr. David Alan Gilbert }
270482a33c5SDr. David Alan Gilbert 
/*
 * Convert a 64-bit value from host to network byte order.
 *
 * The upper 32 bits are converted with htonl() and stored first, followed
 * by the converted lower 32 bits, so the returned object holds the value
 * most-significant byte first in memory.
 */
static uint64_t htonll(uint64_t v)
{
    union { uint32_t half[2]; uint64_t whole; } pun = {
        .half = { htonl(v >> 32), htonl(v & 0xFFFFFFFFULL) },
    };

    return pun.whole;
}
278329c9b10SDr. David Alan Gilbert 
/*
 * Convert a 64-bit value from network to host byte order.
 *
 * The first 32-bit word in memory is treated as the upper half and the
 * second as the lower half; each is converted with ntohl().
 */
static uint64_t ntohll(uint64_t v)
{
    union { uint32_t half[2]; uint64_t whole; } pun = { .whole = v };
    const uint64_t upper = ntohl(pun.half[0]);
    const uint64_t lower = ntohl(pun.half[1]);

    return (upper << 32) | lower;
}
285329c9b10SDr. David Alan Gilbert 
286a97270adSDr. David Alan Gilbert static void dest_block_to_network(RDMADestBlock *db)
287329c9b10SDr. David Alan Gilbert {
288a97270adSDr. David Alan Gilbert     db->remote_host_addr = htonll(db->remote_host_addr);
289a97270adSDr. David Alan Gilbert     db->offset = htonll(db->offset);
290a97270adSDr. David Alan Gilbert     db->length = htonll(db->length);
291a97270adSDr. David Alan Gilbert     db->remote_rkey = htonl(db->remote_rkey);
292329c9b10SDr. David Alan Gilbert }
293329c9b10SDr. David Alan Gilbert 
294a97270adSDr. David Alan Gilbert static void network_to_dest_block(RDMADestBlock *db)
295329c9b10SDr. David Alan Gilbert {
296a97270adSDr. David Alan Gilbert     db->remote_host_addr = ntohll(db->remote_host_addr);
297a97270adSDr. David Alan Gilbert     db->offset = ntohll(db->offset);
298a97270adSDr. David Alan Gilbert     db->length = ntohll(db->length);
299a97270adSDr. David Alan Gilbert     db->remote_rkey = ntohl(db->remote_rkey);
300329c9b10SDr. David Alan Gilbert }
301329c9b10SDr. David Alan Gilbert 
302329c9b10SDr. David Alan Gilbert /*
303329c9b10SDr. David Alan Gilbert  * Virtual address of the above structures used for transmitting
304329c9b10SDr. David Alan Gilbert  * the RAMBlock descriptions at connection-time.
305329c9b10SDr. David Alan Gilbert  * This structure is *not* transmitted.
306329c9b10SDr. David Alan Gilbert  */
307329c9b10SDr. David Alan Gilbert typedef struct RDMALocalBlocks {
308329c9b10SDr. David Alan Gilbert     int nb_blocks;
309329c9b10SDr. David Alan Gilbert     bool     init;             /* main memory init complete */
310329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block;
311329c9b10SDr. David Alan Gilbert } RDMALocalBlocks;
312329c9b10SDr. David Alan Gilbert 
313329c9b10SDr. David Alan Gilbert /*
314329c9b10SDr. David Alan Gilbert  * Main data structure for RDMA state.
315329c9b10SDr. David Alan Gilbert  * While there is only one copy of this structure being allocated right now,
316329c9b10SDr. David Alan Gilbert  * this is the place where one would start if you wanted to consider
317329c9b10SDr. David Alan Gilbert  * having more than one RDMA connection open at the same time.
318329c9b10SDr. David Alan Gilbert  */
319329c9b10SDr. David Alan Gilbert typedef struct RDMAContext {
320329c9b10SDr. David Alan Gilbert     char *host;
321329c9b10SDr. David Alan Gilbert     int port;
32244bcfd45SLi Zhijian     char *host_port;
323329c9b10SDr. David Alan Gilbert 
324329c9b10SDr. David Alan Gilbert     RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
325329c9b10SDr. David Alan Gilbert 
326329c9b10SDr. David Alan Gilbert     /*
327329c9b10SDr. David Alan Gilbert      * This is used by *_exchange_send() to figure out whether or not
328329c9b10SDr. David Alan Gilbert      * the initial "READY" message has already been received or not.
329329c9b10SDr. David Alan Gilbert      * This is because other functions may potentially poll() and detect
330329c9b10SDr. David Alan Gilbert      * the READY message before send() does, in which case we need to
331329c9b10SDr. David Alan Gilbert      * know if it completed.
332329c9b10SDr. David Alan Gilbert      */
333329c9b10SDr. David Alan Gilbert     int control_ready_expected;
334329c9b10SDr. David Alan Gilbert 
335329c9b10SDr. David Alan Gilbert     /* number of outstanding writes */
336329c9b10SDr. David Alan Gilbert     int nb_sent;
337329c9b10SDr. David Alan Gilbert 
338329c9b10SDr. David Alan Gilbert     /* store info about current buffer so that we can
339329c9b10SDr. David Alan Gilbert        merge it with future sends */
340329c9b10SDr. David Alan Gilbert     uint64_t current_addr;
341329c9b10SDr. David Alan Gilbert     uint64_t current_length;
342329c9b10SDr. David Alan Gilbert     /* index of ram block the current buffer belongs to */
343329c9b10SDr. David Alan Gilbert     int current_index;
344329c9b10SDr. David Alan Gilbert     /* index of the chunk in the current ram block */
345329c9b10SDr. David Alan Gilbert     int current_chunk;
346329c9b10SDr. David Alan Gilbert 
347329c9b10SDr. David Alan Gilbert     bool pin_all;
348329c9b10SDr. David Alan Gilbert 
349329c9b10SDr. David Alan Gilbert     /*
350329c9b10SDr. David Alan Gilbert      * infiniband-specific variables for opening the device
351329c9b10SDr. David Alan Gilbert      * and maintaining connection state and so forth.
352329c9b10SDr. David Alan Gilbert      *
353329c9b10SDr. David Alan Gilbert      * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
354329c9b10SDr. David Alan Gilbert      * cm_id->verbs, cm_id->channel, and cm_id->qp.
355329c9b10SDr. David Alan Gilbert      */
356329c9b10SDr. David Alan Gilbert     struct rdma_cm_id *cm_id;               /* connection manager ID */
357329c9b10SDr. David Alan Gilbert     struct rdma_cm_id *listen_id;
358329c9b10SDr. David Alan Gilbert     bool connected;
359329c9b10SDr. David Alan Gilbert 
360329c9b10SDr. David Alan Gilbert     struct ibv_context          *verbs;
361329c9b10SDr. David Alan Gilbert     struct rdma_event_channel   *channel;
362329c9b10SDr. David Alan Gilbert     struct ibv_qp *qp;                      /* queue pair */
363b390afd8SLi Zhijian     struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
364b390afd8SLi Zhijian     struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
365329c9b10SDr. David Alan Gilbert     struct ibv_pd *pd;                      /* protection domain */
366b390afd8SLi Zhijian     struct ibv_cq *recv_cq;                 /* recvieve completion queue */
367b390afd8SLi Zhijian     struct ibv_cq *send_cq;                 /* send completion queue */
368329c9b10SDr. David Alan Gilbert 
369329c9b10SDr. David Alan Gilbert     /*
370329c9b10SDr. David Alan Gilbert      * If a previous write failed (perhaps because of a failed
371329c9b10SDr. David Alan Gilbert      * memory registration, then do not attempt any future work
372329c9b10SDr. David Alan Gilbert      * and remember the error state.
373329c9b10SDr. David Alan Gilbert      */
374329c9b10SDr. David Alan Gilbert     int error_state;
375329c9b10SDr. David Alan Gilbert     int error_reported;
376cd5ea070SDr. David Alan Gilbert     int received_error;
377329c9b10SDr. David Alan Gilbert 
378329c9b10SDr. David Alan Gilbert     /*
379329c9b10SDr. David Alan Gilbert      * Description of ram blocks used throughout the code.
380329c9b10SDr. David Alan Gilbert      */
381329c9b10SDr. David Alan Gilbert     RDMALocalBlocks local_ram_blocks;
382a97270adSDr. David Alan Gilbert     RDMADestBlock  *dest_blocks;
383329c9b10SDr. David Alan Gilbert 
384e4d63320SDr. David Alan Gilbert     /* Index of the next RAMBlock received during block registration */
385e4d63320SDr. David Alan Gilbert     unsigned int    next_src_index;
386e4d63320SDr. David Alan Gilbert 
387329c9b10SDr. David Alan Gilbert     /*
388329c9b10SDr. David Alan Gilbert      * Migration on *destination* started.
389329c9b10SDr. David Alan Gilbert      * Then use coroutine yield function.
390329c9b10SDr. David Alan Gilbert      * Source runs in a thread, so we don't care.
391329c9b10SDr. David Alan Gilbert      */
392329c9b10SDr. David Alan Gilbert     int migration_started_on_destination;
393329c9b10SDr. David Alan Gilbert 
394329c9b10SDr. David Alan Gilbert     int total_registrations;
395329c9b10SDr. David Alan Gilbert     int total_writes;
396329c9b10SDr. David Alan Gilbert 
397329c9b10SDr. David Alan Gilbert     int unregister_current, unregister_next;
398329c9b10SDr. David Alan Gilbert     uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
399329c9b10SDr. David Alan Gilbert 
400329c9b10SDr. David Alan Gilbert     GHashTable *blockmap;
40155cc1b59SLidong Chen 
40255cc1b59SLidong Chen     /* the RDMAContext for return path */
40355cc1b59SLidong Chen     struct RDMAContext *return_path;
40455cc1b59SLidong Chen     bool is_return_path;
405329c9b10SDr. David Alan Gilbert } RDMAContext;
406329c9b10SDr. David Alan Gilbert 
4076ddd2d76SDaniel P. Berrange #define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
4088063396bSEduardo Habkost OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
4096ddd2d76SDaniel P. Berrange 
4106ddd2d76SDaniel P. Berrange 
4116ddd2d76SDaniel P. Berrange 
4126ddd2d76SDaniel P. Berrange struct QIOChannelRDMA {
4136ddd2d76SDaniel P. Berrange     QIOChannel parent;
41474637e6fSLidong Chen     RDMAContext *rdmain;
41574637e6fSLidong Chen     RDMAContext *rdmaout;
4166ddd2d76SDaniel P. Berrange     QEMUFile *file;
4176ddd2d76SDaniel P. Berrange     bool blocking; /* XXX we don't actually honour this yet */
4186ddd2d76SDaniel P. Berrange };
419329c9b10SDr. David Alan Gilbert 
/*
 * Main structure for IB Send/Recv control messages.
 * It is prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    /* Total length of data portion */
    uint32_t len;
    /* which control command to perform */
    uint32_t type;
    /* number of commands in data portion of same type */
    uint32_t repeat;
    uint32_t padding;
} RDMAControlHeader;
430329c9b10SDr. David Alan Gilbert 
431329c9b10SDr. David Alan Gilbert static void control_to_network(RDMAControlHeader *control)
432329c9b10SDr. David Alan Gilbert {
433329c9b10SDr. David Alan Gilbert     control->type = htonl(control->type);
434329c9b10SDr. David Alan Gilbert     control->len = htonl(control->len);
435329c9b10SDr. David Alan Gilbert     control->repeat = htonl(control->repeat);
436329c9b10SDr. David Alan Gilbert }
437329c9b10SDr. David Alan Gilbert 
438329c9b10SDr. David Alan Gilbert static void network_to_control(RDMAControlHeader *control)
439329c9b10SDr. David Alan Gilbert {
440329c9b10SDr. David Alan Gilbert     control->type = ntohl(control->type);
441329c9b10SDr. David Alan Gilbert     control->len = ntohl(control->len);
442329c9b10SDr. David Alan Gilbert     control->repeat = ntohl(control->repeat);
443329c9b10SDr. David Alan Gilbert }
444329c9b10SDr. David Alan Gilbert 
445329c9b10SDr. David Alan Gilbert /*
446329c9b10SDr. David Alan Gilbert  * Register a single Chunk.
447329c9b10SDr. David Alan Gilbert  * Information sent by the source VM to inform the dest
448329c9b10SDr. David Alan Gilbert  * to register an single chunk of memory before we can perform
449329c9b10SDr. David Alan Gilbert  * the actual RDMA operation.
450329c9b10SDr. David Alan Gilbert  */
451329c9b10SDr. David Alan Gilbert typedef struct QEMU_PACKED {
452329c9b10SDr. David Alan Gilbert     union QEMU_PACKED {
453b12f7777SDr. David Alan Gilbert         uint64_t current_addr;  /* offset into the ram_addr_t space */
454329c9b10SDr. David Alan Gilbert         uint64_t chunk;         /* chunk to lookup if unregistering */
455329c9b10SDr. David Alan Gilbert     } key;
456329c9b10SDr. David Alan Gilbert     uint32_t current_index; /* which ramblock the chunk belongs to */
457329c9b10SDr. David Alan Gilbert     uint32_t padding;
458329c9b10SDr. David Alan Gilbert     uint64_t chunks;            /* how many sequential chunks to register */
459329c9b10SDr. David Alan Gilbert } RDMARegister;
460329c9b10SDr. David Alan Gilbert 
461b12f7777SDr. David Alan Gilbert static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
462329c9b10SDr. David Alan Gilbert {
463b12f7777SDr. David Alan Gilbert     RDMALocalBlock *local_block;
464b12f7777SDr. David Alan Gilbert     local_block  = &rdma->local_ram_blocks.block[reg->current_index];
465b12f7777SDr. David Alan Gilbert 
466b12f7777SDr. David Alan Gilbert     if (local_block->is_ram_block) {
467b12f7777SDr. David Alan Gilbert         /*
468b12f7777SDr. David Alan Gilbert          * current_addr as passed in is an address in the local ram_addr_t
469b12f7777SDr. David Alan Gilbert          * space, we need to translate this for the destination
470b12f7777SDr. David Alan Gilbert          */
471b12f7777SDr. David Alan Gilbert         reg->key.current_addr -= local_block->offset;
472b12f7777SDr. David Alan Gilbert         reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
473b12f7777SDr. David Alan Gilbert     }
474329c9b10SDr. David Alan Gilbert     reg->key.current_addr = htonll(reg->key.current_addr);
475329c9b10SDr. David Alan Gilbert     reg->current_index = htonl(reg->current_index);
476329c9b10SDr. David Alan Gilbert     reg->chunks = htonll(reg->chunks);
477329c9b10SDr. David Alan Gilbert }
478329c9b10SDr. David Alan Gilbert 
479329c9b10SDr. David Alan Gilbert static void network_to_register(RDMARegister *reg)
480329c9b10SDr. David Alan Gilbert {
481329c9b10SDr. David Alan Gilbert     reg->key.current_addr = ntohll(reg->key.current_addr);
482329c9b10SDr. David Alan Gilbert     reg->current_index = ntohl(reg->current_index);
483329c9b10SDr. David Alan Gilbert     reg->chunks = ntohll(reg->chunks);
484329c9b10SDr. David Alan Gilbert }
485329c9b10SDr. David Alan Gilbert 
/*
 * Payload of a compress command, describing a chunk by block index,
 * offset and length.
 */
typedef struct QEMU_PACKED {
    /* if zero, we will madvise() */
    uint32_t value;
    /* which ram block index */
    uint32_t block_idx;
    /* Address in remote ram_addr_t space */
    uint64_t offset;
    /* length of the chunk */
    uint64_t length;
} RDMACompress;
492329c9b10SDr. David Alan Gilbert 
493b12f7777SDr. David Alan Gilbert static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
494329c9b10SDr. David Alan Gilbert {
495329c9b10SDr. David Alan Gilbert     comp->value = htonl(comp->value);
496b12f7777SDr. David Alan Gilbert     /*
497b12f7777SDr. David Alan Gilbert      * comp->offset as passed in is an address in the local ram_addr_t
498b12f7777SDr. David Alan Gilbert      * space, we need to translate this for the destination
499b12f7777SDr. David Alan Gilbert      */
500b12f7777SDr. David Alan Gilbert     comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
501b12f7777SDr. David Alan Gilbert     comp->offset += rdma->dest_blocks[comp->block_idx].offset;
502329c9b10SDr. David Alan Gilbert     comp->block_idx = htonl(comp->block_idx);
503329c9b10SDr. David Alan Gilbert     comp->offset = htonll(comp->offset);
504329c9b10SDr. David Alan Gilbert     comp->length = htonll(comp->length);
505329c9b10SDr. David Alan Gilbert }
506329c9b10SDr. David Alan Gilbert 
507329c9b10SDr. David Alan Gilbert static void network_to_compress(RDMACompress *comp)
508329c9b10SDr. David Alan Gilbert {
509329c9b10SDr. David Alan Gilbert     comp->value = ntohl(comp->value);
510329c9b10SDr. David Alan Gilbert     comp->block_idx = ntohl(comp->block_idx);
511329c9b10SDr. David Alan Gilbert     comp->offset = ntohll(comp->offset);
512329c9b10SDr. David Alan Gilbert     comp->length = ntohll(comp->length);
513329c9b10SDr. David Alan Gilbert }
514329c9b10SDr. David Alan Gilbert 
/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 *
 * rkey and host_addr are byte-swapped by result_to_network() and
 * network_to_result(); padding is left untouched by both.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
525329c9b10SDr. David Alan Gilbert 
526329c9b10SDr. David Alan Gilbert static void result_to_network(RDMARegisterResult *result)
527329c9b10SDr. David Alan Gilbert {
528329c9b10SDr. David Alan Gilbert     result->rkey = htonl(result->rkey);
529329c9b10SDr. David Alan Gilbert     result->host_addr = htonll(result->host_addr);
530329c9b10SDr. David Alan Gilbert };
531329c9b10SDr. David Alan Gilbert 
532329c9b10SDr. David Alan Gilbert static void network_to_result(RDMARegisterResult *result)
533329c9b10SDr. David Alan Gilbert {
534329c9b10SDr. David Alan Gilbert     result->rkey = ntohl(result->rkey);
535329c9b10SDr. David Alan Gilbert     result->host_addr = ntohll(result->host_addr);
536329c9b10SDr. David Alan Gilbert };
537329c9b10SDr. David Alan Gilbert 
/*
 * Forward declarations; both functions are defined outside this part of
 * the file.
 *
 * NOTE(review): print_wrid() presumably maps a work-request id to a
 * printable name - confirm at its definition.
 */
const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));
543329c9b10SDr. David Alan Gilbert 
544329c9b10SDr. David Alan Gilbert static inline uint64_t ram_chunk_index(const uint8_t *start,
545329c9b10SDr. David Alan Gilbert                                        const uint8_t *host)
546329c9b10SDr. David Alan Gilbert {
547329c9b10SDr. David Alan Gilbert     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
548329c9b10SDr. David Alan Gilbert }
549329c9b10SDr. David Alan Gilbert 
550329c9b10SDr. David Alan Gilbert static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
551329c9b10SDr. David Alan Gilbert                                        uint64_t i)
552329c9b10SDr. David Alan Gilbert {
553fbce8c25SStefan Weil     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
554fbce8c25SStefan Weil                                   (i << RDMA_REG_CHUNK_SHIFT));
555329c9b10SDr. David Alan Gilbert }
556329c9b10SDr. David Alan Gilbert 
557329c9b10SDr. David Alan Gilbert static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
558329c9b10SDr. David Alan Gilbert                                      uint64_t i)
559329c9b10SDr. David Alan Gilbert {
560329c9b10SDr. David Alan Gilbert     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
561329c9b10SDr. David Alan Gilbert                                          (1UL << RDMA_REG_CHUNK_SHIFT);
562329c9b10SDr. David Alan Gilbert 
563329c9b10SDr. David Alan Gilbert     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
564329c9b10SDr. David Alan Gilbert         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
565329c9b10SDr. David Alan Gilbert     }
566329c9b10SDr. David Alan Gilbert 
567329c9b10SDr. David Alan Gilbert     return result;
568329c9b10SDr. David Alan Gilbert }
569329c9b10SDr. David Alan Gilbert 
5704fb5364bSDr. David Alan Gilbert static int rdma_add_block(RDMAContext *rdma, const char *block_name,
5714fb5364bSDr. David Alan Gilbert                          void *host_addr,
572329c9b10SDr. David Alan Gilbert                          ram_addr_t block_offset, uint64_t length)
573329c9b10SDr. David Alan Gilbert {
574329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
575760ff4beSDr. David Alan Gilbert     RDMALocalBlock *block;
576329c9b10SDr. David Alan Gilbert     RDMALocalBlock *old = local->block;
577329c9b10SDr. David Alan Gilbert 
57897f3ad35SMarkus Armbruster     local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
579329c9b10SDr. David Alan Gilbert 
580329c9b10SDr. David Alan Gilbert     if (local->nb_blocks) {
581329c9b10SDr. David Alan Gilbert         int x;
582329c9b10SDr. David Alan Gilbert 
583760ff4beSDr. David Alan Gilbert         if (rdma->blockmap) {
584329c9b10SDr. David Alan Gilbert             for (x = 0; x < local->nb_blocks; x++) {
585fbce8c25SStefan Weil                 g_hash_table_remove(rdma->blockmap,
586fbce8c25SStefan Weil                                     (void *)(uintptr_t)old[x].offset);
587fbce8c25SStefan Weil                 g_hash_table_insert(rdma->blockmap,
588fbce8c25SStefan Weil                                     (void *)(uintptr_t)old[x].offset,
589329c9b10SDr. David Alan Gilbert                                     &local->block[x]);
590329c9b10SDr. David Alan Gilbert             }
591760ff4beSDr. David Alan Gilbert         }
592329c9b10SDr. David Alan Gilbert         memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
593329c9b10SDr. David Alan Gilbert         g_free(old);
594329c9b10SDr. David Alan Gilbert     }
595329c9b10SDr. David Alan Gilbert 
596329c9b10SDr. David Alan Gilbert     block = &local->block[local->nb_blocks];
597329c9b10SDr. David Alan Gilbert 
5984fb5364bSDr. David Alan Gilbert     block->block_name = g_strdup(block_name);
599329c9b10SDr. David Alan Gilbert     block->local_host_addr = host_addr;
600329c9b10SDr. David Alan Gilbert     block->offset = block_offset;
601329c9b10SDr. David Alan Gilbert     block->length = length;
602329c9b10SDr. David Alan Gilbert     block->index = local->nb_blocks;
603e4d63320SDr. David Alan Gilbert     block->src_index = ~0U; /* Filled in by the receipt of the block list */
604329c9b10SDr. David Alan Gilbert     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
605329c9b10SDr. David Alan Gilbert     block->transit_bitmap = bitmap_new(block->nb_chunks);
606329c9b10SDr. David Alan Gilbert     bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
607329c9b10SDr. David Alan Gilbert     block->unregister_bitmap = bitmap_new(block->nb_chunks);
608329c9b10SDr. David Alan Gilbert     bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
60997f3ad35SMarkus Armbruster     block->remote_keys = g_new0(uint32_t, block->nb_chunks);
610329c9b10SDr. David Alan Gilbert 
611329c9b10SDr. David Alan Gilbert     block->is_ram_block = local->init ? false : true;
612329c9b10SDr. David Alan Gilbert 
613760ff4beSDr. David Alan Gilbert     if (rdma->blockmap) {
61480e60c6eSJuan Quintela         g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
615760ff4beSDr. David Alan Gilbert     }
616329c9b10SDr. David Alan Gilbert 
6174fb5364bSDr. David Alan Gilbert     trace_rdma_add_block(block_name, local->nb_blocks,
6184fb5364bSDr. David Alan Gilbert                          (uintptr_t) block->local_host_addr,
619ba795761SDr. David Alan Gilbert                          block->offset, block->length,
620fbce8c25SStefan Weil                          (uintptr_t) (block->local_host_addr + block->length),
621329c9b10SDr. David Alan Gilbert                          BITS_TO_LONGS(block->nb_chunks) *
622733252deSDr. David Alan Gilbert                              sizeof(unsigned long) * 8,
623733252deSDr. David Alan Gilbert                          block->nb_chunks);
624329c9b10SDr. David Alan Gilbert 
625329c9b10SDr. David Alan Gilbert     local->nb_blocks++;
626329c9b10SDr. David Alan Gilbert 
627329c9b10SDr. David Alan Gilbert     return 0;
628329c9b10SDr. David Alan Gilbert }
629329c9b10SDr. David Alan Gilbert 
630329c9b10SDr. David Alan Gilbert /*
631329c9b10SDr. David Alan Gilbert  * Memory regions need to be registered with the device and queue pairs setup
632329c9b10SDr. David Alan Gilbert  * in advanced before the migration starts. This tells us where the RAM blocks
633329c9b10SDr. David Alan Gilbert  * are so that we can register them individually.
634329c9b10SDr. David Alan Gilbert  */
635754cb9c0SYury Kotov static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
636329c9b10SDr. David Alan Gilbert {
637754cb9c0SYury Kotov     const char *block_name = qemu_ram_get_idstr(rb);
638754cb9c0SYury Kotov     void *host_addr = qemu_ram_get_host_addr(rb);
639754cb9c0SYury Kotov     ram_addr_t block_offset = qemu_ram_get_offset(rb);
640754cb9c0SYury Kotov     ram_addr_t length = qemu_ram_get_used_length(rb);
6414fb5364bSDr. David Alan Gilbert     return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
642329c9b10SDr. David Alan Gilbert }
643329c9b10SDr. David Alan Gilbert 
644329c9b10SDr. David Alan Gilbert /*
645329c9b10SDr. David Alan Gilbert  * Identify the RAMBlocks and their quantity. They will be references to
646329c9b10SDr. David Alan Gilbert  * identify chunk boundaries inside each RAMBlock and also be referenced
647329c9b10SDr. David Alan Gilbert  * during dynamic page registration.
648329c9b10SDr. David Alan Gilbert  */
649329c9b10SDr. David Alan Gilbert static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
650329c9b10SDr. David Alan Gilbert {
651329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
652281496bbSDr. David Alan Gilbert     int ret;
653329c9b10SDr. David Alan Gilbert 
654329c9b10SDr. David Alan Gilbert     assert(rdma->blockmap == NULL);
655329c9b10SDr. David Alan Gilbert     memset(local, 0, sizeof *local);
656281496bbSDr. David Alan Gilbert     ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
657281496bbSDr. David Alan Gilbert     if (ret) {
658281496bbSDr. David Alan Gilbert         return ret;
659281496bbSDr. David Alan Gilbert     }
660733252deSDr. David Alan Gilbert     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
66197f3ad35SMarkus Armbruster     rdma->dest_blocks = g_new0(RDMADestBlock,
662329c9b10SDr. David Alan Gilbert                                rdma->local_ram_blocks.nb_blocks);
663329c9b10SDr. David Alan Gilbert     local->init = true;
664329c9b10SDr. David Alan Gilbert     return 0;
665329c9b10SDr. David Alan Gilbert }
666329c9b10SDr. David Alan Gilbert 
/*
 * Remove @block from rdma->local_ram_blocks.
 *
 * Deregisters the block's memory regions, frees its per-block allocations,
 * rebuilds the block array one element smaller and, when a blockmap exists,
 * re-populates it so that every value points into the new array.
 *
 * @block is expected to point into the current local->block array (the
 * single-block path asserts this); it is read until the old array is freed
 * near the end of the function.
 *
 * Always returns 0.
 *
 * Note: If used outside of cleanup, the caller must ensure that the destination
 * block structures are also updated
 */
static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;
    int x;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    /* Per-chunk registrations: release every chunk that has one. */
    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    /* Whole-block registration, if any. */
    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    /*
     * Every array element is about to move, so drop all map entries now;
     * the survivors are re-inserted at the end with their new addresses.
     */
    if (rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        /* Copy the elements that precede the deleted one... */
        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        /* ...then those that follow it, shifting their index down by one. */
        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                sizeof(RDMALocalBlock) *
                    (local->nb_blocks - (block->index + 1)));
            for (x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    /* block still points into 'old', which stays valid until g_free below. */
    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                           block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                           BITS_TO_LONGS(block->nb_chunks) *
                               sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    /* Re-register the remaining blocks under their new addresses. */
    if (local->nb_blocks && rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}
760329c9b10SDr. David Alan Gilbert 
761329c9b10SDr. David Alan Gilbert /*
762329c9b10SDr. David Alan Gilbert  * Put in the log file which RDMA device was opened and the details
763329c9b10SDr. David Alan Gilbert  * associated with that device.
764329c9b10SDr. David Alan Gilbert  */
765329c9b10SDr. David Alan Gilbert static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
766329c9b10SDr. David Alan Gilbert {
767329c9b10SDr. David Alan Gilbert     struct ibv_port_attr port;
768329c9b10SDr. David Alan Gilbert 
769329c9b10SDr. David Alan Gilbert     if (ibv_query_port(verbs, 1, &port)) {
770733252deSDr. David Alan Gilbert         error_report("Failed to query port information");
771329c9b10SDr. David Alan Gilbert         return;
772329c9b10SDr. David Alan Gilbert     }
773329c9b10SDr. David Alan Gilbert 
774329c9b10SDr. David Alan Gilbert     printf("%s RDMA Device opened: kernel name %s "
775329c9b10SDr. David Alan Gilbert            "uverbs device name %s, "
776329c9b10SDr. David Alan Gilbert            "infiniband_verbs class device path %s, "
777329c9b10SDr. David Alan Gilbert            "infiniband class device path %s, "
778329c9b10SDr. David Alan Gilbert            "transport: (%d) %s\n",
779329c9b10SDr. David Alan Gilbert                 who,
780329c9b10SDr. David Alan Gilbert                 verbs->device->name,
781329c9b10SDr. David Alan Gilbert                 verbs->device->dev_name,
782329c9b10SDr. David Alan Gilbert                 verbs->device->dev_path,
783329c9b10SDr. David Alan Gilbert                 verbs->device->ibdev_path,
784329c9b10SDr. David Alan Gilbert                 port.link_layer,
785329c9b10SDr. David Alan Gilbert                 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
786329c9b10SDr. David Alan Gilbert                  ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
787329c9b10SDr. David Alan Gilbert                     ? "Ethernet" : "Unknown"));
788329c9b10SDr. David Alan Gilbert }
789329c9b10SDr. David Alan Gilbert 
790329c9b10SDr. David Alan Gilbert /*
791329c9b10SDr. David Alan Gilbert  * Put in the log file the RDMA gid addressing information,
792329c9b10SDr. David Alan Gilbert  * useful for folks who have trouble understanding the
793329c9b10SDr. David Alan Gilbert  * RDMA device hierarchy in the kernel.
794329c9b10SDr. David Alan Gilbert  */
795329c9b10SDr. David Alan Gilbert static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
796329c9b10SDr. David Alan Gilbert {
797329c9b10SDr. David Alan Gilbert     char sgid[33];
798329c9b10SDr. David Alan Gilbert     char dgid[33];
799329c9b10SDr. David Alan Gilbert     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
800329c9b10SDr. David Alan Gilbert     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
801733252deSDr. David Alan Gilbert     trace_qemu_rdma_dump_gid(who, sgid, dgid);
802329c9b10SDr. David Alan Gilbert }
803329c9b10SDr. David Alan Gilbert 
804329c9b10SDr. David Alan Gilbert /*
805329c9b10SDr. David Alan Gilbert  * As of now, IPv6 over RoCE / iWARP is not supported by linux.
806329c9b10SDr. David Alan Gilbert  * We will try the next addrinfo struct, and fail if there are
807329c9b10SDr. David Alan Gilbert  * no other valid addresses to bind against.
808329c9b10SDr. David Alan Gilbert  *
809329c9b10SDr. David Alan Gilbert  * If user is listening on '[::]', then we will not have a opened a device
810329c9b10SDr. David Alan Gilbert  * yet and have no way of verifying if the device is RoCE or not.
811329c9b10SDr. David Alan Gilbert  *
812329c9b10SDr. David Alan Gilbert  * In this case, the source VM will throw an error for ALL types of
813329c9b10SDr. David Alan Gilbert  * connections (both IPv4 and IPv6) if the destination machine does not have
814329c9b10SDr. David Alan Gilbert  * a regular infiniband network available for use.
815329c9b10SDr. David Alan Gilbert  *
816329c9b10SDr. David Alan Gilbert  * The only way to guarantee that an error is thrown for broken kernels is
817329c9b10SDr. David Alan Gilbert  * for the management software to choose a *specific* interface at bind time
818329c9b10SDr. David Alan Gilbert  * and validate what time of hardware it is.
819329c9b10SDr. David Alan Gilbert  *
820329c9b10SDr. David Alan Gilbert  * Unfortunately, this puts the user in a fix:
821329c9b10SDr. David Alan Gilbert  *
822329c9b10SDr. David Alan Gilbert  *  If the source VM connects with an IPv4 address without knowing that the
823329c9b10SDr. David Alan Gilbert  *  destination has bound to '[::]' the migration will unconditionally fail
824b6af0975SDaniel P. Berrange  *  unless the management software is explicitly listening on the IPv4
825329c9b10SDr. David Alan Gilbert  *  address while using a RoCE-based device.
826329c9b10SDr. David Alan Gilbert  *
827329c9b10SDr. David Alan Gilbert  *  If the source VM connects with an IPv6 address, then we're OK because we can
828329c9b10SDr. David Alan Gilbert  *  throw an error on the source (and similarly on the destination).
829329c9b10SDr. David Alan Gilbert  *
830329c9b10SDr. David Alan Gilbert  *  But in mixed environments, this will be broken for a while until it is fixed
831329c9b10SDr. David Alan Gilbert  *  inside linux.
832329c9b10SDr. David Alan Gilbert  *
833329c9b10SDr. David Alan Gilbert  * We do provide a *tiny* bit of help in this function: We can list all of the
834329c9b10SDr. David Alan Gilbert  * devices in the system and check to see if all the devices are RoCE or
835329c9b10SDr. David Alan Gilbert  * Infiniband.
836329c9b10SDr. David Alan Gilbert  *
837329c9b10SDr. David Alan Gilbert  * If we detect that we have a *pure* RoCE environment, then we can safely
838329c9b10SDr. David Alan Gilbert  * thrown an error even if the management software has specified '[::]' as the
839329c9b10SDr. David Alan Gilbert  * bind address.
840329c9b10SDr. David Alan Gilbert  *
841329c9b10SDr. David Alan Gilbert  * However, if there is are multiple hetergeneous devices, then we cannot make
842329c9b10SDr. David Alan Gilbert  * this assumption and the user just has to be sure they know what they are
843329c9b10SDr. David Alan Gilbert  * doing.
844329c9b10SDr. David Alan Gilbert  *
845329c9b10SDr. David Alan Gilbert  * Patches are being reviewed on linux-rdma.
846329c9b10SDr. David Alan Gilbert  */
847bbfb89e3SFam Zheng static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
848329c9b10SDr. David Alan Gilbert {
849329c9b10SDr. David Alan Gilbert     /* This bug only exists in linux, to our knowledge. */
850329c9b10SDr. David Alan Gilbert #ifdef CONFIG_LINUX
8511f4abd81SAlex Bennée     struct ibv_port_attr port_attr;
852329c9b10SDr. David Alan Gilbert 
853329c9b10SDr. David Alan Gilbert     /*
854329c9b10SDr. David Alan Gilbert      * Verbs are only NULL if management has bound to '[::]'.
855329c9b10SDr. David Alan Gilbert      *
856329c9b10SDr. David Alan Gilbert      * Let's iterate through all the devices and see if there any pure IB
857329c9b10SDr. David Alan Gilbert      * devices (non-ethernet).
858329c9b10SDr. David Alan Gilbert      *
859329c9b10SDr. David Alan Gilbert      * If not, then we can safely proceed with the migration.
860329c9b10SDr. David Alan Gilbert      * Otherwise, there are no guarantees until the bug is fixed in linux.
861329c9b10SDr. David Alan Gilbert      */
862329c9b10SDr. David Alan Gilbert     if (!verbs) {
863329c9b10SDr. David Alan Gilbert         int num_devices, x;
864329c9b10SDr. David Alan Gilbert         struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
865329c9b10SDr. David Alan Gilbert         bool roce_found = false;
866329c9b10SDr. David Alan Gilbert         bool ib_found = false;
867329c9b10SDr. David Alan Gilbert 
868329c9b10SDr. David Alan Gilbert         for (x = 0; x < num_devices; x++) {
869329c9b10SDr. David Alan Gilbert             verbs = ibv_open_device(dev_list[x]);
8705b61d575SPadmanabh Ratnakar             if (!verbs) {
8715b61d575SPadmanabh Ratnakar                 if (errno == EPERM) {
8725b61d575SPadmanabh Ratnakar                     continue;
8735b61d575SPadmanabh Ratnakar                 } else {
8745b61d575SPadmanabh Ratnakar                     return -EINVAL;
8755b61d575SPadmanabh Ratnakar                 }
8765b61d575SPadmanabh Ratnakar             }
877329c9b10SDr. David Alan Gilbert 
878329c9b10SDr. David Alan Gilbert             if (ibv_query_port(verbs, 1, &port_attr)) {
879329c9b10SDr. David Alan Gilbert                 ibv_close_device(verbs);
880329c9b10SDr. David Alan Gilbert                 ERROR(errp, "Could not query initial IB port");
881329c9b10SDr. David Alan Gilbert                 return -EINVAL;
882329c9b10SDr. David Alan Gilbert             }
883329c9b10SDr. David Alan Gilbert 
884329c9b10SDr. David Alan Gilbert             if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
885329c9b10SDr. David Alan Gilbert                 ib_found = true;
886329c9b10SDr. David Alan Gilbert             } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
887329c9b10SDr. David Alan Gilbert                 roce_found = true;
888329c9b10SDr. David Alan Gilbert             }
889329c9b10SDr. David Alan Gilbert 
890329c9b10SDr. David Alan Gilbert             ibv_close_device(verbs);
891329c9b10SDr. David Alan Gilbert 
892329c9b10SDr. David Alan Gilbert         }
893329c9b10SDr. David Alan Gilbert 
894329c9b10SDr. David Alan Gilbert         if (roce_found) {
895329c9b10SDr. David Alan Gilbert             if (ib_found) {
896329c9b10SDr. David Alan Gilbert                 fprintf(stderr, "WARN: migrations may fail:"
897329c9b10SDr. David Alan Gilbert                                 " IPv6 over RoCE / iWARP in linux"
898329c9b10SDr. David Alan Gilbert                                 " is broken. But since you appear to have a"
899329c9b10SDr. David Alan Gilbert                                 " mixed RoCE / IB environment, be sure to only"
900329c9b10SDr. David Alan Gilbert                                 " migrate over the IB fabric until the kernel "
901329c9b10SDr. David Alan Gilbert                                 " fixes the bug.\n");
902329c9b10SDr. David Alan Gilbert             } else {
903329c9b10SDr. David Alan Gilbert                 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
904329c9b10SDr. David Alan Gilbert                             " and your management software has specified '[::]'"
905329c9b10SDr. David Alan Gilbert                             ", but IPv6 over RoCE / iWARP is not supported in Linux.");
906329c9b10SDr. David Alan Gilbert                 return -ENONET;
907329c9b10SDr. David Alan Gilbert             }
908329c9b10SDr. David Alan Gilbert         }
909329c9b10SDr. David Alan Gilbert 
910329c9b10SDr. David Alan Gilbert         return 0;
911329c9b10SDr. David Alan Gilbert     }
912329c9b10SDr. David Alan Gilbert 
913329c9b10SDr. David Alan Gilbert     /*
914329c9b10SDr. David Alan Gilbert      * If we have a verbs context, that means that some other than '[::]' was
91502942db7SStefan Weil      * used by the management software for binding. In which case we can
91602942db7SStefan Weil      * actually warn the user about a potentially broken kernel.
917329c9b10SDr. David Alan Gilbert      */
918329c9b10SDr. David Alan Gilbert 
919329c9b10SDr. David Alan Gilbert     /* IB ports start with 1, not 0 */
920329c9b10SDr. David Alan Gilbert     if (ibv_query_port(verbs, 1, &port_attr)) {
921329c9b10SDr. David Alan Gilbert         ERROR(errp, "Could not query initial IB port");
922329c9b10SDr. David Alan Gilbert         return -EINVAL;
923329c9b10SDr. David Alan Gilbert     }
924329c9b10SDr. David Alan Gilbert 
925329c9b10SDr. David Alan Gilbert     if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
926329c9b10SDr. David Alan Gilbert         ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
927329c9b10SDr. David Alan Gilbert                     "(but patches on linux-rdma in progress)");
928329c9b10SDr. David Alan Gilbert         return -ENONET;
929329c9b10SDr. David Alan Gilbert     }
930329c9b10SDr. David Alan Gilbert 
931329c9b10SDr. David Alan Gilbert #endif
932329c9b10SDr. David Alan Gilbert 
933329c9b10SDr. David Alan Gilbert     return 0;
934329c9b10SDr. David Alan Gilbert }
935329c9b10SDr. David Alan Gilbert 
936329c9b10SDr. David Alan Gilbert /*
937329c9b10SDr. David Alan Gilbert  * Figure out which RDMA device corresponds to the requested IP hostname
938329c9b10SDr. David Alan Gilbert  * Also create the initial connection manager identifiers for opening
939329c9b10SDr. David Alan Gilbert  * the connection.
940329c9b10SDr. David Alan Gilbert  */
941329c9b10SDr. David Alan Gilbert static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
942329c9b10SDr. David Alan Gilbert {
943329c9b10SDr. David Alan Gilbert     int ret;
944329c9b10SDr. David Alan Gilbert     struct rdma_addrinfo *res;
945329c9b10SDr. David Alan Gilbert     char port_str[16];
946329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
947329c9b10SDr. David Alan Gilbert     char ip[40] = "unknown";
948329c9b10SDr. David Alan Gilbert     struct rdma_addrinfo *e;
949329c9b10SDr. David Alan Gilbert 
950329c9b10SDr. David Alan Gilbert     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
951329c9b10SDr. David Alan Gilbert         ERROR(errp, "RDMA hostname has not been set");
952329c9b10SDr. David Alan Gilbert         return -EINVAL;
953329c9b10SDr. David Alan Gilbert     }
954329c9b10SDr. David Alan Gilbert 
955329c9b10SDr. David Alan Gilbert     /* create CM channel */
956329c9b10SDr. David Alan Gilbert     rdma->channel = rdma_create_event_channel();
957329c9b10SDr. David Alan Gilbert     if (!rdma->channel) {
958329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not create CM channel");
959329c9b10SDr. David Alan Gilbert         return -EINVAL;
960329c9b10SDr. David Alan Gilbert     }
961329c9b10SDr. David Alan Gilbert 
962329c9b10SDr. David Alan Gilbert     /* create CM id */
963329c9b10SDr. David Alan Gilbert     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
964329c9b10SDr. David Alan Gilbert     if (ret) {
965329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not create channel id");
966329c9b10SDr. David Alan Gilbert         goto err_resolve_create_id;
967329c9b10SDr. David Alan Gilbert     }
968329c9b10SDr. David Alan Gilbert 
969329c9b10SDr. David Alan Gilbert     snprintf(port_str, 16, "%d", rdma->port);
970329c9b10SDr. David Alan Gilbert     port_str[15] = '\0';
971329c9b10SDr. David Alan Gilbert 
972329c9b10SDr. David Alan Gilbert     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
973329c9b10SDr. David Alan Gilbert     if (ret < 0) {
974329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
975329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
976329c9b10SDr. David Alan Gilbert     }
977329c9b10SDr. David Alan Gilbert 
978329c9b10SDr. David Alan Gilbert     for (e = res; e != NULL; e = e->ai_next) {
979329c9b10SDr. David Alan Gilbert         inet_ntop(e->ai_family,
980329c9b10SDr. David Alan Gilbert             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
981733252deSDr. David Alan Gilbert         trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
982329c9b10SDr. David Alan Gilbert 
983329c9b10SDr. David Alan Gilbert         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
984329c9b10SDr. David Alan Gilbert                 RDMA_RESOLVE_TIMEOUT_MS);
985329c9b10SDr. David Alan Gilbert         if (!ret) {
986329c9b10SDr. David Alan Gilbert             if (e->ai_family == AF_INET6) {
987bbfb89e3SFam Zheng                 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
988329c9b10SDr. David Alan Gilbert                 if (ret) {
989329c9b10SDr. David Alan Gilbert                     continue;
990329c9b10SDr. David Alan Gilbert                 }
991329c9b10SDr. David Alan Gilbert             }
992329c9b10SDr. David Alan Gilbert             goto route;
993329c9b10SDr. David Alan Gilbert         }
994329c9b10SDr. David Alan Gilbert     }
995329c9b10SDr. David Alan Gilbert 
996f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
997329c9b10SDr. David Alan Gilbert     ERROR(errp, "could not resolve address %s", rdma->host);
998329c9b10SDr. David Alan Gilbert     goto err_resolve_get_addr;
999329c9b10SDr. David Alan Gilbert 
1000329c9b10SDr. David Alan Gilbert route:
1001f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
1002329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
1003329c9b10SDr. David Alan Gilbert 
1004329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
1005329c9b10SDr. David Alan Gilbert     if (ret) {
1006329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not perform event_addr_resolved");
1007329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1008329c9b10SDr. David Alan Gilbert     }
1009329c9b10SDr. David Alan Gilbert 
1010329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
1011329c9b10SDr. David Alan Gilbert         ERROR(errp, "result not equal to event_addr_resolved %s",
1012329c9b10SDr. David Alan Gilbert                 rdma_event_str(cm_event->event));
1013e5f60791SLi Zhijian         error_report("rdma_resolve_addr");
1014329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
1015329c9b10SDr. David Alan Gilbert         ret = -EINVAL;
1016329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1017329c9b10SDr. David Alan Gilbert     }
1018329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
1019329c9b10SDr. David Alan Gilbert 
1020329c9b10SDr. David Alan Gilbert     /* resolve route */
1021329c9b10SDr. David Alan Gilbert     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1022329c9b10SDr. David Alan Gilbert     if (ret) {
1023329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not resolve rdma route");
1024329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1025329c9b10SDr. David Alan Gilbert     }
1026329c9b10SDr. David Alan Gilbert 
1027329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
1028329c9b10SDr. David Alan Gilbert     if (ret) {
1029329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not perform event_route_resolved");
1030329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1031329c9b10SDr. David Alan Gilbert     }
1032329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1033329c9b10SDr. David Alan Gilbert         ERROR(errp, "result not equal to event_route_resolved: %s",
1034329c9b10SDr. David Alan Gilbert                         rdma_event_str(cm_event->event));
1035329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
1036329c9b10SDr. David Alan Gilbert         ret = -EINVAL;
1037329c9b10SDr. David Alan Gilbert         goto err_resolve_get_addr;
1038329c9b10SDr. David Alan Gilbert     }
1039329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
1040329c9b10SDr. David Alan Gilbert     rdma->verbs = rdma->cm_id->verbs;
1041329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1042329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1043329c9b10SDr. David Alan Gilbert     return 0;
1044329c9b10SDr. David Alan Gilbert 
1045329c9b10SDr. David Alan Gilbert err_resolve_get_addr:
1046329c9b10SDr. David Alan Gilbert     rdma_destroy_id(rdma->cm_id);
1047329c9b10SDr. David Alan Gilbert     rdma->cm_id = NULL;
1048329c9b10SDr. David Alan Gilbert err_resolve_create_id:
1049329c9b10SDr. David Alan Gilbert     rdma_destroy_event_channel(rdma->channel);
1050329c9b10SDr. David Alan Gilbert     rdma->channel = NULL;
1051329c9b10SDr. David Alan Gilbert     return ret;
1052329c9b10SDr. David Alan Gilbert }
1053329c9b10SDr. David Alan Gilbert 
1054329c9b10SDr. David Alan Gilbert /*
1055329c9b10SDr. David Alan Gilbert  * Create protection domain and completion queues
1056329c9b10SDr. David Alan Gilbert  */
1057329c9b10SDr. David Alan Gilbert static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1058329c9b10SDr. David Alan Gilbert {
1059329c9b10SDr. David Alan Gilbert     /* allocate pd */
1060329c9b10SDr. David Alan Gilbert     rdma->pd = ibv_alloc_pd(rdma->verbs);
1061329c9b10SDr. David Alan Gilbert     if (!rdma->pd) {
1062733252deSDr. David Alan Gilbert         error_report("failed to allocate protection domain");
1063329c9b10SDr. David Alan Gilbert         return -1;
1064329c9b10SDr. David Alan Gilbert     }
1065329c9b10SDr. David Alan Gilbert 
1066b390afd8SLi Zhijian     /* create receive completion channel */
1067b390afd8SLi Zhijian     rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1068b390afd8SLi Zhijian     if (!rdma->recv_comp_channel) {
1069b390afd8SLi Zhijian         error_report("failed to allocate receive completion channel");
1070329c9b10SDr. David Alan Gilbert         goto err_alloc_pd_cq;
1071329c9b10SDr. David Alan Gilbert     }
1072329c9b10SDr. David Alan Gilbert 
1073329c9b10SDr. David Alan Gilbert     /*
1074b390afd8SLi Zhijian      * Completion queue can be filled by read work requests.
1075329c9b10SDr. David Alan Gilbert      */
1076b390afd8SLi Zhijian     rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1077b390afd8SLi Zhijian                                   NULL, rdma->recv_comp_channel, 0);
1078b390afd8SLi Zhijian     if (!rdma->recv_cq) {
1079b390afd8SLi Zhijian         error_report("failed to allocate receive completion queue");
1080b390afd8SLi Zhijian         goto err_alloc_pd_cq;
1081b390afd8SLi Zhijian     }
1082b390afd8SLi Zhijian 
1083b390afd8SLi Zhijian     /* create send completion channel */
1084b390afd8SLi Zhijian     rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1085b390afd8SLi Zhijian     if (!rdma->send_comp_channel) {
1086b390afd8SLi Zhijian         error_report("failed to allocate send completion channel");
1087b390afd8SLi Zhijian         goto err_alloc_pd_cq;
1088b390afd8SLi Zhijian     }
1089b390afd8SLi Zhijian 
1090b390afd8SLi Zhijian     rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1091b390afd8SLi Zhijian                                   NULL, rdma->send_comp_channel, 0);
1092b390afd8SLi Zhijian     if (!rdma->send_cq) {
1093b390afd8SLi Zhijian         error_report("failed to allocate send completion queue");
1094329c9b10SDr. David Alan Gilbert         goto err_alloc_pd_cq;
1095329c9b10SDr. David Alan Gilbert     }
1096329c9b10SDr. David Alan Gilbert 
1097329c9b10SDr. David Alan Gilbert     return 0;
1098329c9b10SDr. David Alan Gilbert 
1099329c9b10SDr. David Alan Gilbert err_alloc_pd_cq:
1100329c9b10SDr. David Alan Gilbert     if (rdma->pd) {
1101329c9b10SDr. David Alan Gilbert         ibv_dealloc_pd(rdma->pd);
1102329c9b10SDr. David Alan Gilbert     }
1103b390afd8SLi Zhijian     if (rdma->recv_comp_channel) {
1104b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->recv_comp_channel);
1105b390afd8SLi Zhijian     }
1106b390afd8SLi Zhijian     if (rdma->send_comp_channel) {
1107b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->send_comp_channel);
1108b390afd8SLi Zhijian     }
1109b390afd8SLi Zhijian     if (rdma->recv_cq) {
1110b390afd8SLi Zhijian         ibv_destroy_cq(rdma->recv_cq);
1111b390afd8SLi Zhijian         rdma->recv_cq = NULL;
1112329c9b10SDr. David Alan Gilbert     }
1113329c9b10SDr. David Alan Gilbert     rdma->pd = NULL;
1114b390afd8SLi Zhijian     rdma->recv_comp_channel = NULL;
1115b390afd8SLi Zhijian     rdma->send_comp_channel = NULL;
1116329c9b10SDr. David Alan Gilbert     return -1;
1117329c9b10SDr. David Alan Gilbert 
1118329c9b10SDr. David Alan Gilbert }
1119329c9b10SDr. David Alan Gilbert 
1120329c9b10SDr. David Alan Gilbert /*
1121329c9b10SDr. David Alan Gilbert  * Create queue pairs.
1122329c9b10SDr. David Alan Gilbert  */
1123329c9b10SDr. David Alan Gilbert static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1124329c9b10SDr. David Alan Gilbert {
1125329c9b10SDr. David Alan Gilbert     struct ibv_qp_init_attr attr = { 0 };
1126329c9b10SDr. David Alan Gilbert     int ret;
1127329c9b10SDr. David Alan Gilbert 
1128329c9b10SDr. David Alan Gilbert     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1129329c9b10SDr. David Alan Gilbert     attr.cap.max_recv_wr = 3;
1130329c9b10SDr. David Alan Gilbert     attr.cap.max_send_sge = 1;
1131329c9b10SDr. David Alan Gilbert     attr.cap.max_recv_sge = 1;
1132b390afd8SLi Zhijian     attr.send_cq = rdma->send_cq;
1133b390afd8SLi Zhijian     attr.recv_cq = rdma->recv_cq;
1134329c9b10SDr. David Alan Gilbert     attr.qp_type = IBV_QPT_RC;
1135329c9b10SDr. David Alan Gilbert 
1136329c9b10SDr. David Alan Gilbert     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1137329c9b10SDr. David Alan Gilbert     if (ret) {
1138329c9b10SDr. David Alan Gilbert         return -1;
1139329c9b10SDr. David Alan Gilbert     }
1140329c9b10SDr. David Alan Gilbert 
1141329c9b10SDr. David Alan Gilbert     rdma->qp = rdma->cm_id->qp;
1142329c9b10SDr. David Alan Gilbert     return 0;
1143329c9b10SDr. David Alan Gilbert }
1144329c9b10SDr. David Alan Gilbert 
1145e2daccb0SLi Zhijian /* Check whether On-Demand Paging is supported by RDAM device */
1146e2daccb0SLi Zhijian static bool rdma_support_odp(struct ibv_context *dev)
1147e2daccb0SLi Zhijian {
1148e2daccb0SLi Zhijian     struct ibv_device_attr_ex attr = {0};
1149e2daccb0SLi Zhijian     int ret = ibv_query_device_ex(dev, NULL, &attr);
1150e2daccb0SLi Zhijian     if (ret) {
1151e2daccb0SLi Zhijian         return false;
1152e2daccb0SLi Zhijian     }
1153e2daccb0SLi Zhijian 
1154e2daccb0SLi Zhijian     if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1155e2daccb0SLi Zhijian         return true;
1156e2daccb0SLi Zhijian     }
1157e2daccb0SLi Zhijian 
1158e2daccb0SLi Zhijian     return false;
1159e2daccb0SLi Zhijian }
1160e2daccb0SLi Zhijian 
/*
 * Use ibv_advise_mr() to avoid RNR NAK errors as far as possible.
 * A responder whose MR is registered with ODP sends an RNR NAK back to
 * the requester when it hits a page fault.
 *
 * @pd:   protection domain the MR belongs to
 * @addr: start address of the range to prefetch
 * @len:  length of the range in bytes
 * @lkey: local key of the MR covering the range
 * @name: label used only for tracing
 * @wr:   true selects the PREFETCH_WRITE advice, false plain PREFETCH
 *
 * The outcome is only traced; failures are deliberately ignored.  The
 * function is a no-op when HAVE_IBV_ADVISE_MR is not defined.
 */
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len,  uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    struct ibv_sge sge = {.addr = addr, .length = len, .lkey = lkey};
    const char *outcome;
    int advice;

    if (wr) {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE;
    } else {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH;
    }

    /* best effort: the result only selects the traced text */
    if (ibv_advise_mr(pd, advice, IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1)) {
        outcome = strerror(errno);
    } else {
        outcome = "successed";
    }
    trace_qemu_rdma_advise_mr(name, len, addr, outcome);
#endif
}
1186911965acSLi Zhijian 
1187329c9b10SDr. David Alan Gilbert static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1188329c9b10SDr. David Alan Gilbert {
1189329c9b10SDr. David Alan Gilbert     int i;
1190329c9b10SDr. David Alan Gilbert     RDMALocalBlocks *local = &rdma->local_ram_blocks;
1191329c9b10SDr. David Alan Gilbert 
1192329c9b10SDr. David Alan Gilbert     for (i = 0; i < local->nb_blocks; i++) {
1193e2daccb0SLi Zhijian         int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1194e2daccb0SLi Zhijian 
1195329c9b10SDr. David Alan Gilbert         local->block[i].mr =
1196329c9b10SDr. David Alan Gilbert             ibv_reg_mr(rdma->pd,
1197329c9b10SDr. David Alan Gilbert                     local->block[i].local_host_addr,
1198e2daccb0SLi Zhijian                     local->block[i].length, access
1199329c9b10SDr. David Alan Gilbert                     );
1200e2daccb0SLi Zhijian 
1201e2daccb0SLi Zhijian         if (!local->block[i].mr &&
1202e2daccb0SLi Zhijian             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1203e2daccb0SLi Zhijian                 access |= IBV_ACCESS_ON_DEMAND;
1204e2daccb0SLi Zhijian                 /* register ODP mr */
1205e2daccb0SLi Zhijian                 local->block[i].mr =
1206e2daccb0SLi Zhijian                     ibv_reg_mr(rdma->pd,
1207e2daccb0SLi Zhijian                                local->block[i].local_host_addr,
1208e2daccb0SLi Zhijian                                local->block[i].length, access);
1209e2daccb0SLi Zhijian                 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1210911965acSLi Zhijian 
1211911965acSLi Zhijian                 if (local->block[i].mr) {
1212911965acSLi Zhijian                     qemu_rdma_advise_prefetch_mr(rdma->pd,
1213911965acSLi Zhijian                                     (uintptr_t)local->block[i].local_host_addr,
1214911965acSLi Zhijian                                     local->block[i].length,
1215911965acSLi Zhijian                                     local->block[i].mr->lkey,
1216911965acSLi Zhijian                                     local->block[i].block_name,
1217911965acSLi Zhijian                                     true);
1218911965acSLi Zhijian                 }
1219e2daccb0SLi Zhijian         }
1220e2daccb0SLi Zhijian 
1221329c9b10SDr. David Alan Gilbert         if (!local->block[i].mr) {
1222eb1960aaSLi Zhijian             perror("Failed to register local dest ram block!");
1223329c9b10SDr. David Alan Gilbert             break;
1224329c9b10SDr. David Alan Gilbert         }
1225329c9b10SDr. David Alan Gilbert         rdma->total_registrations++;
1226329c9b10SDr. David Alan Gilbert     }
1227329c9b10SDr. David Alan Gilbert 
1228329c9b10SDr. David Alan Gilbert     if (i >= local->nb_blocks) {
1229329c9b10SDr. David Alan Gilbert         return 0;
1230329c9b10SDr. David Alan Gilbert     }
1231329c9b10SDr. David Alan Gilbert 
1232329c9b10SDr. David Alan Gilbert     for (i--; i >= 0; i--) {
1233329c9b10SDr. David Alan Gilbert         ibv_dereg_mr(local->block[i].mr);
1234224f364aSLi Zhijian         local->block[i].mr = NULL;
1235329c9b10SDr. David Alan Gilbert         rdma->total_registrations--;
1236329c9b10SDr. David Alan Gilbert     }
1237329c9b10SDr. David Alan Gilbert 
1238329c9b10SDr. David Alan Gilbert     return -1;
1239329c9b10SDr. David Alan Gilbert 
1240329c9b10SDr. David Alan Gilbert }
1241329c9b10SDr. David Alan Gilbert 
1242329c9b10SDr. David Alan Gilbert /*
1243329c9b10SDr. David Alan Gilbert  * Find the ram block that corresponds to the page requested to be
1244329c9b10SDr. David Alan Gilbert  * transmitted by QEMU.
1245329c9b10SDr. David Alan Gilbert  *
1246329c9b10SDr. David Alan Gilbert  * Once the block is found, also identify which 'chunk' within that
1247329c9b10SDr. David Alan Gilbert  * block that the page belongs to.
1248329c9b10SDr. David Alan Gilbert  *
1249329c9b10SDr. David Alan Gilbert  * This search cannot fail or the migration will fail.
1250329c9b10SDr. David Alan Gilbert  */
1251329c9b10SDr. David Alan Gilbert static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1252fbce8c25SStefan Weil                                       uintptr_t block_offset,
1253329c9b10SDr. David Alan Gilbert                                       uint64_t offset,
1254329c9b10SDr. David Alan Gilbert                                       uint64_t length,
1255329c9b10SDr. David Alan Gilbert                                       uint64_t *block_index,
1256329c9b10SDr. David Alan Gilbert                                       uint64_t *chunk_index)
1257329c9b10SDr. David Alan Gilbert {
1258329c9b10SDr. David Alan Gilbert     uint64_t current_addr = block_offset + offset;
1259329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1260329c9b10SDr. David Alan Gilbert                                                 (void *) block_offset);
1261329c9b10SDr. David Alan Gilbert     assert(block);
1262329c9b10SDr. David Alan Gilbert     assert(current_addr >= block->offset);
1263329c9b10SDr. David Alan Gilbert     assert((current_addr + length) <= (block->offset + block->length));
1264329c9b10SDr. David Alan Gilbert 
1265329c9b10SDr. David Alan Gilbert     *block_index = block->index;
1266329c9b10SDr. David Alan Gilbert     *chunk_index = ram_chunk_index(block->local_host_addr,
1267329c9b10SDr. David Alan Gilbert                 block->local_host_addr + (current_addr - block->offset));
1268329c9b10SDr. David Alan Gilbert 
1269329c9b10SDr. David Alan Gilbert     return 0;
1270329c9b10SDr. David Alan Gilbert }
1271329c9b10SDr. David Alan Gilbert 
1272329c9b10SDr. David Alan Gilbert /*
1273329c9b10SDr. David Alan Gilbert  * Register a chunk with IB. If the chunk was already registered
1274329c9b10SDr. David Alan Gilbert  * previously, then skip.
1275329c9b10SDr. David Alan Gilbert  *
1276329c9b10SDr. David Alan Gilbert  * Also return the keys associated with the registration needed
1277329c9b10SDr. David Alan Gilbert  * to perform the actual RDMA operation.
1278329c9b10SDr. David Alan Gilbert  */
1279329c9b10SDr. David Alan Gilbert static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
12803ac040c0SStefan Weil         RDMALocalBlock *block, uintptr_t host_addr,
1281329c9b10SDr. David Alan Gilbert         uint32_t *lkey, uint32_t *rkey, int chunk,
1282329c9b10SDr. David Alan Gilbert         uint8_t *chunk_start, uint8_t *chunk_end)
1283329c9b10SDr. David Alan Gilbert {
1284329c9b10SDr. David Alan Gilbert     if (block->mr) {
1285329c9b10SDr. David Alan Gilbert         if (lkey) {
1286329c9b10SDr. David Alan Gilbert             *lkey = block->mr->lkey;
1287329c9b10SDr. David Alan Gilbert         }
1288329c9b10SDr. David Alan Gilbert         if (rkey) {
1289329c9b10SDr. David Alan Gilbert             *rkey = block->mr->rkey;
1290329c9b10SDr. David Alan Gilbert         }
1291329c9b10SDr. David Alan Gilbert         return 0;
1292329c9b10SDr. David Alan Gilbert     }
1293329c9b10SDr. David Alan Gilbert 
1294329c9b10SDr. David Alan Gilbert     /* allocate memory to store chunk MRs */
1295329c9b10SDr. David Alan Gilbert     if (!block->pmr) {
129697f3ad35SMarkus Armbruster         block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1297329c9b10SDr. David Alan Gilbert     }
1298329c9b10SDr. David Alan Gilbert 
1299329c9b10SDr. David Alan Gilbert     /*
1300329c9b10SDr. David Alan Gilbert      * If 'rkey', then we're the destination, so grant access to the source.
1301329c9b10SDr. David Alan Gilbert      *
1302329c9b10SDr. David Alan Gilbert      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1303329c9b10SDr. David Alan Gilbert      */
1304329c9b10SDr. David Alan Gilbert     if (!block->pmr[chunk]) {
1305329c9b10SDr. David Alan Gilbert         uint64_t len = chunk_end - chunk_start;
1306e2daccb0SLi Zhijian         int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1307e2daccb0SLi Zhijian                      0;
1308329c9b10SDr. David Alan Gilbert 
1309733252deSDr. David Alan Gilbert         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1310329c9b10SDr. David Alan Gilbert 
1311e2daccb0SLi Zhijian         block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1312e2daccb0SLi Zhijian         if (!block->pmr[chunk] &&
1313e2daccb0SLi Zhijian             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1314e2daccb0SLi Zhijian             access |= IBV_ACCESS_ON_DEMAND;
1315e2daccb0SLi Zhijian             /* register ODP mr */
1316e2daccb0SLi Zhijian             block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1317e2daccb0SLi Zhijian             trace_qemu_rdma_register_odp_mr(block->block_name);
1318911965acSLi Zhijian 
1319911965acSLi Zhijian             if (block->pmr[chunk]) {
1320911965acSLi Zhijian                 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1321911965acSLi Zhijian                                             len, block->pmr[chunk]->lkey,
1322911965acSLi Zhijian                                             block->block_name, rkey);
1323911965acSLi Zhijian 
1324911965acSLi Zhijian             }
1325e2daccb0SLi Zhijian         }
1326e2daccb0SLi Zhijian     }
1327329c9b10SDr. David Alan Gilbert     if (!block->pmr[chunk]) {
1328329c9b10SDr. David Alan Gilbert         perror("Failed to register chunk!");
1329329c9b10SDr. David Alan Gilbert         fprintf(stderr, "Chunk details: block: %d chunk index %d"
13303ac040c0SStefan Weil                         " start %" PRIuPTR " end %" PRIuPTR
13313ac040c0SStefan Weil                         " host %" PRIuPTR
13323ac040c0SStefan Weil                         " local %" PRIuPTR " registrations: %d\n",
13333ac040c0SStefan Weil                         block->index, chunk, (uintptr_t)chunk_start,
13343ac040c0SStefan Weil                         (uintptr_t)chunk_end, host_addr,
13353ac040c0SStefan Weil                         (uintptr_t)block->local_host_addr,
1336329c9b10SDr. David Alan Gilbert                         rdma->total_registrations);
1337329c9b10SDr. David Alan Gilbert         return -1;
1338329c9b10SDr. David Alan Gilbert     }
1339329c9b10SDr. David Alan Gilbert     rdma->total_registrations++;
1340329c9b10SDr. David Alan Gilbert 
1341329c9b10SDr. David Alan Gilbert     if (lkey) {
1342329c9b10SDr. David Alan Gilbert         *lkey = block->pmr[chunk]->lkey;
1343329c9b10SDr. David Alan Gilbert     }
1344329c9b10SDr. David Alan Gilbert     if (rkey) {
1345329c9b10SDr. David Alan Gilbert         *rkey = block->pmr[chunk]->rkey;
1346329c9b10SDr. David Alan Gilbert     }
1347329c9b10SDr. David Alan Gilbert     return 0;
1348329c9b10SDr. David Alan Gilbert }
1349329c9b10SDr. David Alan Gilbert 
1350329c9b10SDr. David Alan Gilbert /*
1351329c9b10SDr. David Alan Gilbert  * Register (at connection time) the memory used for control
1352329c9b10SDr. David Alan Gilbert  * channel messages.
1353329c9b10SDr. David Alan Gilbert  */
1354329c9b10SDr. David Alan Gilbert static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1355329c9b10SDr. David Alan Gilbert {
1356329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1357329c9b10SDr. David Alan Gilbert             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1358329c9b10SDr. David Alan Gilbert             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1359329c9b10SDr. David Alan Gilbert     if (rdma->wr_data[idx].control_mr) {
1360329c9b10SDr. David Alan Gilbert         rdma->total_registrations++;
1361329c9b10SDr. David Alan Gilbert         return 0;
1362329c9b10SDr. David Alan Gilbert     }
1363733252deSDr. David Alan Gilbert     error_report("qemu_rdma_reg_control failed");
1364329c9b10SDr. David Alan Gilbert     return -1;
1365329c9b10SDr. David Alan Gilbert }
1366329c9b10SDr. David Alan Gilbert 
1367329c9b10SDr. David Alan Gilbert const char *print_wrid(int wrid)
1368329c9b10SDr. David Alan Gilbert {
1369329c9b10SDr. David Alan Gilbert     if (wrid >= RDMA_WRID_RECV_CONTROL) {
1370329c9b10SDr. David Alan Gilbert         return wrid_desc[RDMA_WRID_RECV_CONTROL];
1371329c9b10SDr. David Alan Gilbert     }
1372329c9b10SDr. David Alan Gilbert     return wrid_desc[wrid];
1373329c9b10SDr. David Alan Gilbert }
1374329c9b10SDr. David Alan Gilbert 
1375329c9b10SDr. David Alan Gilbert /*
1376329c9b10SDr. David Alan Gilbert  * Perform a non-optimized memory unregistration after every transfer
137724ec68efSDr. David Alan Gilbert  * for demonstration purposes, only if pin-all is not requested.
1378329c9b10SDr. David Alan Gilbert  *
1379329c9b10SDr. David Alan Gilbert  * Potential optimizations:
1380329c9b10SDr. David Alan Gilbert  * 1. Start a new thread to run this function continuously
1381329c9b10SDr. David Alan Gilbert         - for bit clearing
1382329c9b10SDr. David Alan Gilbert         - and for receipt of unregister messages
1383329c9b10SDr. David Alan Gilbert  * 2. Use an LRU.
1384329c9b10SDr. David Alan Gilbert  * 3. Use workload hints.
1385329c9b10SDr. David Alan Gilbert  */
1386329c9b10SDr. David Alan Gilbert static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1387329c9b10SDr. David Alan Gilbert {
1388329c9b10SDr. David Alan Gilbert     while (rdma->unregistrations[rdma->unregister_current]) {
1389329c9b10SDr. David Alan Gilbert         int ret;
1390329c9b10SDr. David Alan Gilbert         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1391329c9b10SDr. David Alan Gilbert         uint64_t chunk =
1392329c9b10SDr. David Alan Gilbert             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1393329c9b10SDr. David Alan Gilbert         uint64_t index =
1394329c9b10SDr. David Alan Gilbert             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1395329c9b10SDr. David Alan Gilbert         RDMALocalBlock *block =
1396329c9b10SDr. David Alan Gilbert             &(rdma->local_ram_blocks.block[index]);
1397329c9b10SDr. David Alan Gilbert         RDMARegister reg = { .current_index = index };
1398329c9b10SDr. David Alan Gilbert         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1399329c9b10SDr. David Alan Gilbert                                  };
1400329c9b10SDr. David Alan Gilbert         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1401329c9b10SDr. David Alan Gilbert                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1402329c9b10SDr. David Alan Gilbert                                    .repeat = 1,
1403329c9b10SDr. David Alan Gilbert                                  };
1404329c9b10SDr. David Alan Gilbert 
1405733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_proc(chunk,
1406733252deSDr. David Alan Gilbert                                                 rdma->unregister_current);
1407329c9b10SDr. David Alan Gilbert 
1408329c9b10SDr. David Alan Gilbert         rdma->unregistrations[rdma->unregister_current] = 0;
1409329c9b10SDr. David Alan Gilbert         rdma->unregister_current++;
1410329c9b10SDr. David Alan Gilbert 
1411329c9b10SDr. David Alan Gilbert         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1412329c9b10SDr. David Alan Gilbert             rdma->unregister_current = 0;
1413329c9b10SDr. David Alan Gilbert         }
1414329c9b10SDr. David Alan Gilbert 
1415329c9b10SDr. David Alan Gilbert 
1416329c9b10SDr. David Alan Gilbert         /*
1417329c9b10SDr. David Alan Gilbert          * Unregistration is speculative (because migration is single-threaded
1418329c9b10SDr. David Alan Gilbert          * and we cannot break the protocol's inifinband message ordering).
1419329c9b10SDr. David Alan Gilbert          * Thus, if the memory is currently being used for transmission,
1420329c9b10SDr. David Alan Gilbert          * then abort the attempt to unregister and try again
1421329c9b10SDr. David Alan Gilbert          * later the next time a completion is received for this memory.
1422329c9b10SDr. David Alan Gilbert          */
1423329c9b10SDr. David Alan Gilbert         clear_bit(chunk, block->unregister_bitmap);
1424329c9b10SDr. David Alan Gilbert 
1425329c9b10SDr. David Alan Gilbert         if (test_bit(chunk, block->transit_bitmap)) {
1426733252deSDr. David Alan Gilbert             trace_qemu_rdma_unregister_waiting_inflight(chunk);
1427329c9b10SDr. David Alan Gilbert             continue;
1428329c9b10SDr. David Alan Gilbert         }
1429329c9b10SDr. David Alan Gilbert 
1430733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_send(chunk);
1431329c9b10SDr. David Alan Gilbert 
1432329c9b10SDr. David Alan Gilbert         ret = ibv_dereg_mr(block->pmr[chunk]);
1433329c9b10SDr. David Alan Gilbert         block->pmr[chunk] = NULL;
1434329c9b10SDr. David Alan Gilbert         block->remote_keys[chunk] = 0;
1435329c9b10SDr. David Alan Gilbert 
1436329c9b10SDr. David Alan Gilbert         if (ret != 0) {
1437329c9b10SDr. David Alan Gilbert             perror("unregistration chunk failed");
1438329c9b10SDr. David Alan Gilbert             return -ret;
1439329c9b10SDr. David Alan Gilbert         }
1440329c9b10SDr. David Alan Gilbert         rdma->total_registrations--;
1441329c9b10SDr. David Alan Gilbert 
1442329c9b10SDr. David Alan Gilbert         reg.key.chunk = chunk;
1443b12f7777SDr. David Alan Gilbert         register_to_network(rdma, &reg);
1444329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1445329c9b10SDr. David Alan Gilbert                                 &resp, NULL, NULL);
1446329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1447329c9b10SDr. David Alan Gilbert             return ret;
1448329c9b10SDr. David Alan Gilbert         }
1449329c9b10SDr. David Alan Gilbert 
1450733252deSDr. David Alan Gilbert         trace_qemu_rdma_unregister_waiting_complete(chunk);
1451329c9b10SDr. David Alan Gilbert     }
1452329c9b10SDr. David Alan Gilbert 
1453329c9b10SDr. David Alan Gilbert     return 0;
1454329c9b10SDr. David Alan Gilbert }
1455329c9b10SDr. David Alan Gilbert 
1456329c9b10SDr. David Alan Gilbert static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1457329c9b10SDr. David Alan Gilbert                                          uint64_t chunk)
1458329c9b10SDr. David Alan Gilbert {
1459329c9b10SDr. David Alan Gilbert     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1460329c9b10SDr. David Alan Gilbert 
1461329c9b10SDr. David Alan Gilbert     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1462329c9b10SDr. David Alan Gilbert     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1463329c9b10SDr. David Alan Gilbert 
1464329c9b10SDr. David Alan Gilbert     return result;
1465329c9b10SDr. David Alan Gilbert }
1466329c9b10SDr. David Alan Gilbert 
1467329c9b10SDr. David Alan Gilbert /*
1468329c9b10SDr. David Alan Gilbert  * Consult the connection manager to see a work request
1469329c9b10SDr. David Alan Gilbert  * (of any kind) has completed.
1470329c9b10SDr. David Alan Gilbert  * Return the work request ID that completed.
1471329c9b10SDr. David Alan Gilbert  */
1472b390afd8SLi Zhijian static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1473b390afd8SLi Zhijian                                uint64_t *wr_id_out, uint32_t *byte_len)
1474329c9b10SDr. David Alan Gilbert {
1475329c9b10SDr. David Alan Gilbert     int ret;
1476329c9b10SDr. David Alan Gilbert     struct ibv_wc wc;
1477329c9b10SDr. David Alan Gilbert     uint64_t wr_id;
1478329c9b10SDr. David Alan Gilbert 
1479b390afd8SLi Zhijian     ret = ibv_poll_cq(cq, 1, &wc);
1480329c9b10SDr. David Alan Gilbert 
1481329c9b10SDr. David Alan Gilbert     if (!ret) {
1482329c9b10SDr. David Alan Gilbert         *wr_id_out = RDMA_WRID_NONE;
1483329c9b10SDr. David Alan Gilbert         return 0;
1484329c9b10SDr. David Alan Gilbert     }
1485329c9b10SDr. David Alan Gilbert 
1486329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1487733252deSDr. David Alan Gilbert         error_report("ibv_poll_cq return %d", ret);
1488329c9b10SDr. David Alan Gilbert         return ret;
1489329c9b10SDr. David Alan Gilbert     }
1490329c9b10SDr. David Alan Gilbert 
1491329c9b10SDr. David Alan Gilbert     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1492329c9b10SDr. David Alan Gilbert 
1493329c9b10SDr. David Alan Gilbert     if (wc.status != IBV_WC_SUCCESS) {
1494329c9b10SDr. David Alan Gilbert         fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1495329c9b10SDr. David Alan Gilbert                         wc.status, ibv_wc_status_str(wc.status));
1496329c9b10SDr. David Alan Gilbert         fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1497329c9b10SDr. David Alan Gilbert 
1498329c9b10SDr. David Alan Gilbert         return -1;
1499329c9b10SDr. David Alan Gilbert     }
1500329c9b10SDr. David Alan Gilbert 
1501329c9b10SDr. David Alan Gilbert     if (rdma->control_ready_expected &&
1502329c9b10SDr. David Alan Gilbert         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1503733252deSDr. David Alan Gilbert         trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1504329c9b10SDr. David Alan Gilbert                   wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1505329c9b10SDr. David Alan Gilbert         rdma->control_ready_expected = 0;
1506329c9b10SDr. David Alan Gilbert     }
1507329c9b10SDr. David Alan Gilbert 
1508329c9b10SDr. David Alan Gilbert     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1509329c9b10SDr. David Alan Gilbert         uint64_t chunk =
1510329c9b10SDr. David Alan Gilbert             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1511329c9b10SDr. David Alan Gilbert         uint64_t index =
1512329c9b10SDr. David Alan Gilbert             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1513329c9b10SDr. David Alan Gilbert         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1514329c9b10SDr. David Alan Gilbert 
1515733252deSDr. David Alan Gilbert         trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1516fbce8c25SStefan Weil                                    index, chunk, block->local_host_addr,
1517fbce8c25SStefan Weil                                    (void *)(uintptr_t)block->remote_host_addr);
1518329c9b10SDr. David Alan Gilbert 
1519329c9b10SDr. David Alan Gilbert         clear_bit(chunk, block->transit_bitmap);
1520329c9b10SDr. David Alan Gilbert 
1521329c9b10SDr. David Alan Gilbert         if (rdma->nb_sent > 0) {
1522329c9b10SDr. David Alan Gilbert             rdma->nb_sent--;
1523329c9b10SDr. David Alan Gilbert         }
1524329c9b10SDr. David Alan Gilbert     } else {
1525733252deSDr. David Alan Gilbert         trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1526329c9b10SDr. David Alan Gilbert     }
1527329c9b10SDr. David Alan Gilbert 
1528329c9b10SDr. David Alan Gilbert     *wr_id_out = wc.wr_id;
1529329c9b10SDr. David Alan Gilbert     if (byte_len) {
1530329c9b10SDr. David Alan Gilbert         *byte_len = wc.byte_len;
1531329c9b10SDr. David Alan Gilbert     }
1532329c9b10SDr. David Alan Gilbert 
1533329c9b10SDr. David Alan Gilbert     return  0;
1534329c9b10SDr. David Alan Gilbert }
1535329c9b10SDr. David Alan Gilbert 
15369c98cfbeSDr. David Alan Gilbert /* Wait for activity on the completion channel.
15379c98cfbeSDr. David Alan Gilbert  * Returns 0 on success, none-0 on error.
15389c98cfbeSDr. David Alan Gilbert  */
1539b390afd8SLi Zhijian static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1540b390afd8SLi Zhijian                                        struct ibv_comp_channel *comp_channel)
15419c98cfbeSDr. David Alan Gilbert {
1542d5882995SLidong Chen     struct rdma_cm_event *cm_event;
1543d5882995SLidong Chen     int ret = -1;
1544d5882995SLidong Chen 
15459c98cfbeSDr. David Alan Gilbert     /*
15469c98cfbeSDr. David Alan Gilbert      * Coroutine doesn't start until migration_fd_process_incoming()
15479c98cfbeSDr. David Alan Gilbert      * so don't yield unless we know we're running inside of a coroutine.
15489c98cfbeSDr. David Alan Gilbert      */
1549f5627c2aSLidong Chen     if (rdma->migration_started_on_destination &&
1550f5627c2aSLidong Chen         migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1551b390afd8SLi Zhijian         yield_until_fd_readable(comp_channel->fd);
15529c98cfbeSDr. David Alan Gilbert     } else {
15539c98cfbeSDr. David Alan Gilbert         /* This is the source side, we're in a separate thread
15549c98cfbeSDr. David Alan Gilbert          * or destination prior to migration_fd_process_incoming()
15553a4452d8Szhaolichang          * after postcopy, the destination also in a separate thread.
15569c98cfbeSDr. David Alan Gilbert          * we can't yield; so we have to poll the fd.
15579c98cfbeSDr. David Alan Gilbert          * But we need to be able to handle 'cancel' or an error
15589c98cfbeSDr. David Alan Gilbert          * without hanging forever.
15599c98cfbeSDr. David Alan Gilbert          */
15609c98cfbeSDr. David Alan Gilbert         while (!rdma->error_state  && !rdma->received_error) {
1561d5882995SLidong Chen             GPollFD pfds[2];
1562b390afd8SLi Zhijian             pfds[0].fd = comp_channel->fd;
15639c98cfbeSDr. David Alan Gilbert             pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1564d5882995SLidong Chen             pfds[0].revents = 0;
1565d5882995SLidong Chen 
1566d5882995SLidong Chen             pfds[1].fd = rdma->channel->fd;
1567d5882995SLidong Chen             pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1568d5882995SLidong Chen             pfds[1].revents = 0;
1569d5882995SLidong Chen 
15709c98cfbeSDr. David Alan Gilbert             /* 0.1s timeout, should be fine for a 'cancel' */
1571d5882995SLidong Chen             switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1572d5882995SLidong Chen             case 2:
15739c98cfbeSDr. David Alan Gilbert             case 1: /* fd active */
1574d5882995SLidong Chen                 if (pfds[0].revents) {
15759c98cfbeSDr. David Alan Gilbert                     return 0;
1576d5882995SLidong Chen                 }
1577d5882995SLidong Chen 
1578d5882995SLidong Chen                 if (pfds[1].revents) {
1579d5882995SLidong Chen                     ret = rdma_get_cm_event(rdma->channel, &cm_event);
15806b8c2eb5SLi Zhijian                     if (ret) {
15816b8c2eb5SLi Zhijian                         error_report("failed to get cm event while wait "
15826b8c2eb5SLi Zhijian                                      "completion channel");
15836b8c2eb5SLi Zhijian                         return -EPIPE;
1584d5882995SLidong Chen                     }
1585d5882995SLidong Chen 
1586d5882995SLidong Chen                     error_report("receive cm event while wait comp channel,"
1587d5882995SLidong Chen                                  "cm event is %d", cm_event->event);
1588d5882995SLidong Chen                     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1589d5882995SLidong Chen                         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
15906b8c2eb5SLi Zhijian                         rdma_ack_cm_event(cm_event);
1591d5882995SLidong Chen                         return -EPIPE;
1592d5882995SLidong Chen                     }
15936b8c2eb5SLi Zhijian                     rdma_ack_cm_event(cm_event);
1594d5882995SLidong Chen                 }
1595d5882995SLidong Chen                 break;
15969c98cfbeSDr. David Alan Gilbert 
15979c98cfbeSDr. David Alan Gilbert             case 0: /* Timeout, go around again */
15989c98cfbeSDr. David Alan Gilbert                 break;
15999c98cfbeSDr. David Alan Gilbert 
16009c98cfbeSDr. David Alan Gilbert             default: /* Error of some type -
16019c98cfbeSDr. David Alan Gilbert                       * I don't trust errno from qemu_poll_ns
16029c98cfbeSDr. David Alan Gilbert                      */
16039c98cfbeSDr. David Alan Gilbert                 error_report("%s: poll failed", __func__);
16049c98cfbeSDr. David Alan Gilbert                 return -EPIPE;
16059c98cfbeSDr. David Alan Gilbert             }
16069c98cfbeSDr. David Alan Gilbert 
16079c98cfbeSDr. David Alan Gilbert             if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
16089c98cfbeSDr. David Alan Gilbert                 /* Bail out and let the cancellation happen */
16099c98cfbeSDr. David Alan Gilbert                 return -EPIPE;
16109c98cfbeSDr. David Alan Gilbert             }
16119c98cfbeSDr. David Alan Gilbert         }
16129c98cfbeSDr. David Alan Gilbert     }
16139c98cfbeSDr. David Alan Gilbert 
16149c98cfbeSDr. David Alan Gilbert     if (rdma->received_error) {
16159c98cfbeSDr. David Alan Gilbert         return -EPIPE;
16169c98cfbeSDr. David Alan Gilbert     }
16179c98cfbeSDr. David Alan Gilbert     return rdma->error_state;
16189c98cfbeSDr. David Alan Gilbert }
16199c98cfbeSDr. David Alan Gilbert 
1620b390afd8SLi Zhijian static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
1621b390afd8SLi Zhijian {
1622b390afd8SLi Zhijian     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1623b390afd8SLi Zhijian            rdma->recv_comp_channel;
1624b390afd8SLi Zhijian }
1625b390afd8SLi Zhijian 
1626b390afd8SLi Zhijian static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
1627b390afd8SLi Zhijian {
1628b390afd8SLi Zhijian     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1629b390afd8SLi Zhijian }
1630b390afd8SLi Zhijian 
1631329c9b10SDr. David Alan Gilbert /*
1632329c9b10SDr. David Alan Gilbert  * Block until the next work request has completed.
1633329c9b10SDr. David Alan Gilbert  *
1634329c9b10SDr. David Alan Gilbert  * First poll to see if a work request has already completed,
1635329c9b10SDr. David Alan Gilbert  * otherwise block.
1636329c9b10SDr. David Alan Gilbert  *
1637329c9b10SDr. David Alan Gilbert  * If we encounter completed work requests for IDs other than
1638329c9b10SDr. David Alan Gilbert  * the one we're interested in, then that's generally an error.
1639329c9b10SDr. David Alan Gilbert  *
1640329c9b10SDr. David Alan Gilbert  * The only exception is actual RDMA Write completions. These
1641329c9b10SDr. David Alan Gilbert  * completions only need to be recorded, but do not actually
1642329c9b10SDr. David Alan Gilbert  * need further processing.
1643329c9b10SDr. David Alan Gilbert  */
1644329c9b10SDr. David Alan Gilbert static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1645329c9b10SDr. David Alan Gilbert                                     uint32_t *byte_len)
1646329c9b10SDr. David Alan Gilbert {
1647329c9b10SDr. David Alan Gilbert     int num_cq_events = 0, ret = 0;
1648329c9b10SDr. David Alan Gilbert     struct ibv_cq *cq;
1649329c9b10SDr. David Alan Gilbert     void *cq_ctx;
1650329c9b10SDr. David Alan Gilbert     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1651b390afd8SLi Zhijian     struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1652b390afd8SLi Zhijian     struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1653329c9b10SDr. David Alan Gilbert 
1654b390afd8SLi Zhijian     if (ibv_req_notify_cq(poll_cq, 0)) {
1655329c9b10SDr. David Alan Gilbert         return -1;
1656329c9b10SDr. David Alan Gilbert     }
1657329c9b10SDr. David Alan Gilbert     /* poll cq first */
1658329c9b10SDr. David Alan Gilbert     while (wr_id != wrid_requested) {
1659b390afd8SLi Zhijian         ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1660329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1661329c9b10SDr. David Alan Gilbert             return ret;
1662329c9b10SDr. David Alan Gilbert         }
1663329c9b10SDr. David Alan Gilbert 
1664329c9b10SDr. David Alan Gilbert         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1665329c9b10SDr. David Alan Gilbert 
1666329c9b10SDr. David Alan Gilbert         if (wr_id == RDMA_WRID_NONE) {
1667329c9b10SDr. David Alan Gilbert             break;
1668329c9b10SDr. David Alan Gilbert         }
1669329c9b10SDr. David Alan Gilbert         if (wr_id != wrid_requested) {
1670733252deSDr. David Alan Gilbert             trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1671329c9b10SDr. David Alan Gilbert                        wrid_requested, print_wrid(wr_id), wr_id);
1672329c9b10SDr. David Alan Gilbert         }
1673329c9b10SDr. David Alan Gilbert     }
1674329c9b10SDr. David Alan Gilbert 
1675329c9b10SDr. David Alan Gilbert     if (wr_id == wrid_requested) {
1676329c9b10SDr. David Alan Gilbert         return 0;
1677329c9b10SDr. David Alan Gilbert     }
1678329c9b10SDr. David Alan Gilbert 
1679329c9b10SDr. David Alan Gilbert     while (1) {
1680b390afd8SLi Zhijian         ret = qemu_rdma_wait_comp_channel(rdma, ch);
16819c98cfbeSDr. David Alan Gilbert         if (ret) {
16829c98cfbeSDr. David Alan Gilbert             goto err_block_for_wrid;
1683329c9b10SDr. David Alan Gilbert         }
1684329c9b10SDr. David Alan Gilbert 
1685b390afd8SLi Zhijian         ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
16860b3c15f0SDr. David Alan Gilbert         if (ret) {
1687329c9b10SDr. David Alan Gilbert             perror("ibv_get_cq_event");
1688329c9b10SDr. David Alan Gilbert             goto err_block_for_wrid;
1689329c9b10SDr. David Alan Gilbert         }
1690329c9b10SDr. David Alan Gilbert 
1691329c9b10SDr. David Alan Gilbert         num_cq_events++;
1692329c9b10SDr. David Alan Gilbert 
16930b3c15f0SDr. David Alan Gilbert         ret = -ibv_req_notify_cq(cq, 0);
16940b3c15f0SDr. David Alan Gilbert         if (ret) {
1695329c9b10SDr. David Alan Gilbert             goto err_block_for_wrid;
1696329c9b10SDr. David Alan Gilbert         }
1697329c9b10SDr. David Alan Gilbert 
1698329c9b10SDr. David Alan Gilbert         while (wr_id != wrid_requested) {
1699b390afd8SLi Zhijian             ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1700329c9b10SDr. David Alan Gilbert             if (ret < 0) {
1701329c9b10SDr. David Alan Gilbert                 goto err_block_for_wrid;
1702329c9b10SDr. David Alan Gilbert             }
1703329c9b10SDr. David Alan Gilbert 
1704329c9b10SDr. David Alan Gilbert             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1705329c9b10SDr. David Alan Gilbert 
1706329c9b10SDr. David Alan Gilbert             if (wr_id == RDMA_WRID_NONE) {
1707329c9b10SDr. David Alan Gilbert                 break;
1708329c9b10SDr. David Alan Gilbert             }
1709329c9b10SDr. David Alan Gilbert             if (wr_id != wrid_requested) {
1710733252deSDr. David Alan Gilbert                 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1711733252deSDr. David Alan Gilbert                                    wrid_requested, print_wrid(wr_id), wr_id);
1712329c9b10SDr. David Alan Gilbert             }
1713329c9b10SDr. David Alan Gilbert         }
1714329c9b10SDr. David Alan Gilbert 
1715329c9b10SDr. David Alan Gilbert         if (wr_id == wrid_requested) {
1716329c9b10SDr. David Alan Gilbert             goto success_block_for_wrid;
1717329c9b10SDr. David Alan Gilbert         }
1718329c9b10SDr. David Alan Gilbert     }
1719329c9b10SDr. David Alan Gilbert 
1720329c9b10SDr. David Alan Gilbert success_block_for_wrid:
1721329c9b10SDr. David Alan Gilbert     if (num_cq_events) {
1722329c9b10SDr. David Alan Gilbert         ibv_ack_cq_events(cq, num_cq_events);
1723329c9b10SDr. David Alan Gilbert     }
1724329c9b10SDr. David Alan Gilbert     return 0;
1725329c9b10SDr. David Alan Gilbert 
1726329c9b10SDr. David Alan Gilbert err_block_for_wrid:
1727329c9b10SDr. David Alan Gilbert     if (num_cq_events) {
1728329c9b10SDr. David Alan Gilbert         ibv_ack_cq_events(cq, num_cq_events);
1729329c9b10SDr. David Alan Gilbert     }
17300b3c15f0SDr. David Alan Gilbert 
17310b3c15f0SDr. David Alan Gilbert     rdma->error_state = ret;
1732329c9b10SDr. David Alan Gilbert     return ret;
1733329c9b10SDr. David Alan Gilbert }
1734329c9b10SDr. David Alan Gilbert 
1735329c9b10SDr. David Alan Gilbert /*
1736329c9b10SDr. David Alan Gilbert  * Post a SEND message work request for the control channel
1737329c9b10SDr. David Alan Gilbert  * containing some data and block until the post completes.
1738329c9b10SDr. David Alan Gilbert  */
1739329c9b10SDr. David Alan Gilbert static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1740329c9b10SDr. David Alan Gilbert                                        RDMAControlHeader *head)
1741329c9b10SDr. David Alan Gilbert {
1742329c9b10SDr. David Alan Gilbert     int ret = 0;
1743329c9b10SDr. David Alan Gilbert     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1744329c9b10SDr. David Alan Gilbert     struct ibv_send_wr *bad_wr;
1745329c9b10SDr. David Alan Gilbert     struct ibv_sge sge = {
1746fbce8c25SStefan Weil                            .addr = (uintptr_t)(wr->control),
1747329c9b10SDr. David Alan Gilbert                            .length = head->len + sizeof(RDMAControlHeader),
1748329c9b10SDr. David Alan Gilbert                            .lkey = wr->control_mr->lkey,
1749329c9b10SDr. David Alan Gilbert                          };
1750329c9b10SDr. David Alan Gilbert     struct ibv_send_wr send_wr = {
1751329c9b10SDr. David Alan Gilbert                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1752329c9b10SDr. David Alan Gilbert                                    .opcode = IBV_WR_SEND,
1753329c9b10SDr. David Alan Gilbert                                    .send_flags = IBV_SEND_SIGNALED,
1754329c9b10SDr. David Alan Gilbert                                    .sg_list = &sge,
1755329c9b10SDr. David Alan Gilbert                                    .num_sge = 1,
1756329c9b10SDr. David Alan Gilbert                                 };
1757329c9b10SDr. David Alan Gilbert 
1758482a33c5SDr. David Alan Gilbert     trace_qemu_rdma_post_send_control(control_desc(head->type));
1759329c9b10SDr. David Alan Gilbert 
1760329c9b10SDr. David Alan Gilbert     /*
1761329c9b10SDr. David Alan Gilbert      * We don't actually need to do a memcpy() in here if we used
1762329c9b10SDr. David Alan Gilbert      * the "sge" properly, but since we're only sending control messages
1763329c9b10SDr. David Alan Gilbert      * (not RAM in a performance-critical path), then its OK for now.
1764329c9b10SDr. David Alan Gilbert      *
1765329c9b10SDr. David Alan Gilbert      * The copy makes the RDMAControlHeader simpler to manipulate
1766329c9b10SDr. David Alan Gilbert      * for the time being.
1767329c9b10SDr. David Alan Gilbert      */
1768329c9b10SDr. David Alan Gilbert     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1769329c9b10SDr. David Alan Gilbert     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1770329c9b10SDr. David Alan Gilbert     control_to_network((void *) wr->control);
1771329c9b10SDr. David Alan Gilbert 
1772329c9b10SDr. David Alan Gilbert     if (buf) {
1773329c9b10SDr. David Alan Gilbert         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1774329c9b10SDr. David Alan Gilbert     }
1775329c9b10SDr. David Alan Gilbert 
1776329c9b10SDr. David Alan Gilbert 
1777329c9b10SDr. David Alan Gilbert     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1778329c9b10SDr. David Alan Gilbert 
1779329c9b10SDr. David Alan Gilbert     if (ret > 0) {
1780733252deSDr. David Alan Gilbert         error_report("Failed to use post IB SEND for control");
1781329c9b10SDr. David Alan Gilbert         return -ret;
1782329c9b10SDr. David Alan Gilbert     }
1783329c9b10SDr. David Alan Gilbert 
1784329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1785329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1786733252deSDr. David Alan Gilbert         error_report("rdma migration: send polling control error");
1787329c9b10SDr. David Alan Gilbert     }
1788329c9b10SDr. David Alan Gilbert 
1789329c9b10SDr. David Alan Gilbert     return ret;
1790329c9b10SDr. David Alan Gilbert }
1791329c9b10SDr. David Alan Gilbert 
1792329c9b10SDr. David Alan Gilbert /*
1793329c9b10SDr. David Alan Gilbert  * Post a RECV work request in anticipation of some future receipt
1794329c9b10SDr. David Alan Gilbert  * of data on the control channel.
1795329c9b10SDr. David Alan Gilbert  */
1796329c9b10SDr. David Alan Gilbert static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1797329c9b10SDr. David Alan Gilbert {
1798329c9b10SDr. David Alan Gilbert     struct ibv_recv_wr *bad_wr;
1799329c9b10SDr. David Alan Gilbert     struct ibv_sge sge = {
1800fbce8c25SStefan Weil                             .addr = (uintptr_t)(rdma->wr_data[idx].control),
1801329c9b10SDr. David Alan Gilbert                             .length = RDMA_CONTROL_MAX_BUFFER,
1802329c9b10SDr. David Alan Gilbert                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1803329c9b10SDr. David Alan Gilbert                          };
1804329c9b10SDr. David Alan Gilbert 
1805329c9b10SDr. David Alan Gilbert     struct ibv_recv_wr recv_wr = {
1806329c9b10SDr. David Alan Gilbert                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1807329c9b10SDr. David Alan Gilbert                                     .sg_list = &sge,
1808329c9b10SDr. David Alan Gilbert                                     .num_sge = 1,
1809329c9b10SDr. David Alan Gilbert                                  };
1810329c9b10SDr. David Alan Gilbert 
1811329c9b10SDr. David Alan Gilbert 
1812329c9b10SDr. David Alan Gilbert     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1813329c9b10SDr. David Alan Gilbert         return -1;
1814329c9b10SDr. David Alan Gilbert     }
1815329c9b10SDr. David Alan Gilbert 
1816329c9b10SDr. David Alan Gilbert     return 0;
1817329c9b10SDr. David Alan Gilbert }
1818329c9b10SDr. David Alan Gilbert 
1819329c9b10SDr. David Alan Gilbert /*
1820329c9b10SDr. David Alan Gilbert  * Block and wait for a RECV control channel message to arrive.
1821329c9b10SDr. David Alan Gilbert  */
1822329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1823329c9b10SDr. David Alan Gilbert                 RDMAControlHeader *head, int expecting, int idx)
1824329c9b10SDr. David Alan Gilbert {
1825329c9b10SDr. David Alan Gilbert     uint32_t byte_len;
1826329c9b10SDr. David Alan Gilbert     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1827329c9b10SDr. David Alan Gilbert                                        &byte_len);
1828329c9b10SDr. David Alan Gilbert 
1829329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1830733252deSDr. David Alan Gilbert         error_report("rdma migration: recv polling control error!");
1831329c9b10SDr. David Alan Gilbert         return ret;
1832329c9b10SDr. David Alan Gilbert     }
1833329c9b10SDr. David Alan Gilbert 
1834329c9b10SDr. David Alan Gilbert     network_to_control((void *) rdma->wr_data[idx].control);
1835329c9b10SDr. David Alan Gilbert     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1836329c9b10SDr. David Alan Gilbert 
1837482a33c5SDr. David Alan Gilbert     trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1838329c9b10SDr. David Alan Gilbert 
1839329c9b10SDr. David Alan Gilbert     if (expecting == RDMA_CONTROL_NONE) {
1840482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1841733252deSDr. David Alan Gilbert                                              head->type);
1842329c9b10SDr. David Alan Gilbert     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1843733252deSDr. David Alan Gilbert         error_report("Was expecting a %s (%d) control message"
1844733252deSDr. David Alan Gilbert                 ", but got: %s (%d), length: %d",
1845482a33c5SDr. David Alan Gilbert                 control_desc(expecting), expecting,
1846482a33c5SDr. David Alan Gilbert                 control_desc(head->type), head->type, head->len);
1847cd5ea070SDr. David Alan Gilbert         if (head->type == RDMA_CONTROL_ERROR) {
1848cd5ea070SDr. David Alan Gilbert             rdma->received_error = true;
1849cd5ea070SDr. David Alan Gilbert         }
1850329c9b10SDr. David Alan Gilbert         return -EIO;
1851329c9b10SDr. David Alan Gilbert     }
1852329c9b10SDr. David Alan Gilbert     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
185381b07353SGonglei         error_report("too long length: %d", head->len);
1854329c9b10SDr. David Alan Gilbert         return -EINVAL;
1855329c9b10SDr. David Alan Gilbert     }
1856329c9b10SDr. David Alan Gilbert     if (sizeof(*head) + head->len != byte_len) {
1857733252deSDr. David Alan Gilbert         error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1858329c9b10SDr. David Alan Gilbert         return -EINVAL;
1859329c9b10SDr. David Alan Gilbert     }
1860329c9b10SDr. David Alan Gilbert 
1861329c9b10SDr. David Alan Gilbert     return 0;
1862329c9b10SDr. David Alan Gilbert }
1863329c9b10SDr. David Alan Gilbert 
1864329c9b10SDr. David Alan Gilbert /*
1865329c9b10SDr. David Alan Gilbert  * When a RECV work request has completed, the work request's
1866329c9b10SDr. David Alan Gilbert  * buffer is pointed at the header.
1867329c9b10SDr. David Alan Gilbert  *
1868329c9b10SDr. David Alan Gilbert  * This will advance the pointer to the data portion
1869329c9b10SDr. David Alan Gilbert  * of the control message of the work request's buffer that
1870329c9b10SDr. David Alan Gilbert  * was populated after the work request finished.
1871329c9b10SDr. David Alan Gilbert  */
1872329c9b10SDr. David Alan Gilbert static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1873329c9b10SDr. David Alan Gilbert                                   RDMAControlHeader *head)
1874329c9b10SDr. David Alan Gilbert {
1875329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_len = head->len;
1876329c9b10SDr. David Alan Gilbert     rdma->wr_data[idx].control_curr =
1877329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1878329c9b10SDr. David Alan Gilbert }
1879329c9b10SDr. David Alan Gilbert 
1880329c9b10SDr. David Alan Gilbert /*
1881329c9b10SDr. David Alan Gilbert  * This is an 'atomic' high-level operation to deliver a single, unified
1882329c9b10SDr. David Alan Gilbert  * control-channel message.
1883329c9b10SDr. David Alan Gilbert  *
1884329c9b10SDr. David Alan Gilbert  * Additionally, if the user is expecting some kind of reply to this message,
1885329c9b10SDr. David Alan Gilbert  * they can request a 'resp' response message be filled in by posting an
1886329c9b10SDr. David Alan Gilbert  * additional work request on behalf of the user and waiting for an additional
1887329c9b10SDr. David Alan Gilbert  * completion.
1888329c9b10SDr. David Alan Gilbert  *
1889329c9b10SDr. David Alan Gilbert  * The extra (optional) response is used during registration to us from having
1890329c9b10SDr. David Alan Gilbert  * to perform an *additional* exchange of message just to provide a response by
1891329c9b10SDr. David Alan Gilbert  * instead piggy-backing on the acknowledgement.
1892329c9b10SDr. David Alan Gilbert  */
1893329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1894329c9b10SDr. David Alan Gilbert                                    uint8_t *data, RDMAControlHeader *resp,
1895329c9b10SDr. David Alan Gilbert                                    int *resp_idx,
1896329c9b10SDr. David Alan Gilbert                                    int (*callback)(RDMAContext *rdma))
1897329c9b10SDr. David Alan Gilbert {
1898329c9b10SDr. David Alan Gilbert     int ret = 0;
1899329c9b10SDr. David Alan Gilbert 
1900329c9b10SDr. David Alan Gilbert     /*
1901329c9b10SDr. David Alan Gilbert      * Wait until the dest is ready before attempting to deliver the message
1902329c9b10SDr. David Alan Gilbert      * by waiting for a READY message.
1903329c9b10SDr. David Alan Gilbert      */
1904329c9b10SDr. David Alan Gilbert     if (rdma->control_ready_expected) {
1905329c9b10SDr. David Alan Gilbert         RDMAControlHeader resp;
1906329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_get_response(rdma,
1907329c9b10SDr. David Alan Gilbert                                     &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1908329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1909329c9b10SDr. David Alan Gilbert             return ret;
1910329c9b10SDr. David Alan Gilbert         }
1911329c9b10SDr. David Alan Gilbert     }
1912329c9b10SDr. David Alan Gilbert 
1913329c9b10SDr. David Alan Gilbert     /*
1914329c9b10SDr. David Alan Gilbert      * If the user is expecting a response, post a WR in anticipation of it.
1915329c9b10SDr. David Alan Gilbert      */
1916329c9b10SDr. David Alan Gilbert     if (resp) {
1917329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1918329c9b10SDr. David Alan Gilbert         if (ret) {
1919733252deSDr. David Alan Gilbert             error_report("rdma migration: error posting"
1920329c9b10SDr. David Alan Gilbert                     " extra control recv for anticipated result!");
1921329c9b10SDr. David Alan Gilbert             return ret;
1922329c9b10SDr. David Alan Gilbert         }
1923329c9b10SDr. David Alan Gilbert     }
1924329c9b10SDr. David Alan Gilbert 
1925329c9b10SDr. David Alan Gilbert     /*
1926329c9b10SDr. David Alan Gilbert      * Post a WR to replace the one we just consumed for the READY message.
1927329c9b10SDr. David Alan Gilbert      */
1928329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1929329c9b10SDr. David Alan Gilbert     if (ret) {
1930733252deSDr. David Alan Gilbert         error_report("rdma migration: error posting first control recv!");
1931329c9b10SDr. David Alan Gilbert         return ret;
1932329c9b10SDr. David Alan Gilbert     }
1933329c9b10SDr. David Alan Gilbert 
1934329c9b10SDr. David Alan Gilbert     /*
1935329c9b10SDr. David Alan Gilbert      * Deliver the control message that was requested.
1936329c9b10SDr. David Alan Gilbert      */
1937329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_post_send_control(rdma, data, head);
1938329c9b10SDr. David Alan Gilbert 
1939329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1940733252deSDr. David Alan Gilbert         error_report("Failed to send control buffer!");
1941329c9b10SDr. David Alan Gilbert         return ret;
1942329c9b10SDr. David Alan Gilbert     }
1943329c9b10SDr. David Alan Gilbert 
1944329c9b10SDr. David Alan Gilbert     /*
1945329c9b10SDr. David Alan Gilbert      * If we're expecting a response, block and wait for it.
1946329c9b10SDr. David Alan Gilbert      */
1947329c9b10SDr. David Alan Gilbert     if (resp) {
1948329c9b10SDr. David Alan Gilbert         if (callback) {
1949733252deSDr. David Alan Gilbert             trace_qemu_rdma_exchange_send_issue_callback();
1950329c9b10SDr. David Alan Gilbert             ret = callback(rdma);
1951329c9b10SDr. David Alan Gilbert             if (ret < 0) {
1952329c9b10SDr. David Alan Gilbert                 return ret;
1953329c9b10SDr. David Alan Gilbert             }
1954329c9b10SDr. David Alan Gilbert         }
1955329c9b10SDr. David Alan Gilbert 
1956482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1957329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_get_response(rdma, resp,
1958329c9b10SDr. David Alan Gilbert                                               resp->type, RDMA_WRID_DATA);
1959329c9b10SDr. David Alan Gilbert 
1960329c9b10SDr. David Alan Gilbert         if (ret < 0) {
1961329c9b10SDr. David Alan Gilbert             return ret;
1962329c9b10SDr. David Alan Gilbert         }
1963329c9b10SDr. David Alan Gilbert 
1964329c9b10SDr. David Alan Gilbert         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1965329c9b10SDr. David Alan Gilbert         if (resp_idx) {
1966329c9b10SDr. David Alan Gilbert             *resp_idx = RDMA_WRID_DATA;
1967329c9b10SDr. David Alan Gilbert         }
1968482a33c5SDr. David Alan Gilbert         trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1969329c9b10SDr. David Alan Gilbert     }
1970329c9b10SDr. David Alan Gilbert 
1971329c9b10SDr. David Alan Gilbert     rdma->control_ready_expected = 1;
1972329c9b10SDr. David Alan Gilbert 
1973329c9b10SDr. David Alan Gilbert     return 0;
1974329c9b10SDr. David Alan Gilbert }
1975329c9b10SDr. David Alan Gilbert 
1976329c9b10SDr. David Alan Gilbert /*
1977329c9b10SDr. David Alan Gilbert  * This is an 'atomic' high-level operation to receive a single, unified
1978329c9b10SDr. David Alan Gilbert  * control-channel message.
1979329c9b10SDr. David Alan Gilbert  */
1980329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1981329c9b10SDr. David Alan Gilbert                                 int expecting)
1982329c9b10SDr. David Alan Gilbert {
1983329c9b10SDr. David Alan Gilbert     RDMAControlHeader ready = {
1984329c9b10SDr. David Alan Gilbert                                 .len = 0,
1985329c9b10SDr. David Alan Gilbert                                 .type = RDMA_CONTROL_READY,
1986329c9b10SDr. David Alan Gilbert                                 .repeat = 1,
1987329c9b10SDr. David Alan Gilbert                               };
1988329c9b10SDr. David Alan Gilbert     int ret;
1989329c9b10SDr. David Alan Gilbert 
1990329c9b10SDr. David Alan Gilbert     /*
1991329c9b10SDr. David Alan Gilbert      * Inform the source that we're ready to receive a message.
1992329c9b10SDr. David Alan Gilbert      */
1993329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1994329c9b10SDr. David Alan Gilbert 
1995329c9b10SDr. David Alan Gilbert     if (ret < 0) {
1996733252deSDr. David Alan Gilbert         error_report("Failed to send control buffer!");
1997329c9b10SDr. David Alan Gilbert         return ret;
1998329c9b10SDr. David Alan Gilbert     }
1999329c9b10SDr. David Alan Gilbert 
2000329c9b10SDr. David Alan Gilbert     /*
2001329c9b10SDr. David Alan Gilbert      * Block and wait for the message.
2002329c9b10SDr. David Alan Gilbert      */
2003329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_exchange_get_response(rdma, head,
2004329c9b10SDr. David Alan Gilbert                                           expecting, RDMA_WRID_READY);
2005329c9b10SDr. David Alan Gilbert 
2006329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2007329c9b10SDr. David Alan Gilbert         return ret;
2008329c9b10SDr. David Alan Gilbert     }
2009329c9b10SDr. David Alan Gilbert 
2010329c9b10SDr. David Alan Gilbert     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
2011329c9b10SDr. David Alan Gilbert 
2012329c9b10SDr. David Alan Gilbert     /*
2013329c9b10SDr. David Alan Gilbert      * Post a new RECV work request to replace the one we just consumed.
2014329c9b10SDr. David Alan Gilbert      */
2015329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2016329c9b10SDr. David Alan Gilbert     if (ret) {
2017733252deSDr. David Alan Gilbert         error_report("rdma migration: error posting second control recv!");
2018329c9b10SDr. David Alan Gilbert         return ret;
2019329c9b10SDr. David Alan Gilbert     }
2020329c9b10SDr. David Alan Gilbert 
2021329c9b10SDr. David Alan Gilbert     return 0;
2022329c9b10SDr. David Alan Gilbert }
2023329c9b10SDr. David Alan Gilbert 
2024329c9b10SDr. David Alan Gilbert /*
2025329c9b10SDr. David Alan Gilbert  * Write an actual chunk of memory using RDMA.
2026329c9b10SDr. David Alan Gilbert  *
2027329c9b10SDr. David Alan Gilbert  * If we're using dynamic registration on the dest-side, we have to
2028329c9b10SDr. David Alan Gilbert  * send a registration command first.
2029329c9b10SDr. David Alan Gilbert  */
2030329c9b10SDr. David Alan Gilbert static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
2031329c9b10SDr. David Alan Gilbert                                int current_index, uint64_t current_addr,
2032329c9b10SDr. David Alan Gilbert                                uint64_t length)
2033329c9b10SDr. David Alan Gilbert {
2034329c9b10SDr. David Alan Gilbert     struct ibv_sge sge;
2035329c9b10SDr. David Alan Gilbert     struct ibv_send_wr send_wr = { 0 };
2036329c9b10SDr. David Alan Gilbert     struct ibv_send_wr *bad_wr;
2037329c9b10SDr. David Alan Gilbert     int reg_result_idx, ret, count = 0;
2038329c9b10SDr. David Alan Gilbert     uint64_t chunk, chunks;
2039329c9b10SDr. David Alan Gilbert     uint8_t *chunk_start, *chunk_end;
2040329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2041329c9b10SDr. David Alan Gilbert     RDMARegister reg;
2042329c9b10SDr. David Alan Gilbert     RDMARegisterResult *reg_result;
2043329c9b10SDr. David Alan Gilbert     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2044329c9b10SDr. David Alan Gilbert     RDMAControlHeader head = { .len = sizeof(RDMARegister),
2045329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_REGISTER_REQUEST,
2046329c9b10SDr. David Alan Gilbert                                .repeat = 1,
2047329c9b10SDr. David Alan Gilbert                              };
2048329c9b10SDr. David Alan Gilbert 
2049329c9b10SDr. David Alan Gilbert retry:
2050fbce8c25SStefan Weil     sge.addr = (uintptr_t)(block->local_host_addr +
2051329c9b10SDr. David Alan Gilbert                             (current_addr - block->offset));
2052329c9b10SDr. David Alan Gilbert     sge.length = length;
2053329c9b10SDr. David Alan Gilbert 
2054fbce8c25SStefan Weil     chunk = ram_chunk_index(block->local_host_addr,
2055fbce8c25SStefan Weil                             (uint8_t *)(uintptr_t)sge.addr);
2056329c9b10SDr. David Alan Gilbert     chunk_start = ram_chunk_start(block, chunk);
2057329c9b10SDr. David Alan Gilbert 
2058329c9b10SDr. David Alan Gilbert     if (block->is_ram_block) {
2059329c9b10SDr. David Alan Gilbert         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2060329c9b10SDr. David Alan Gilbert 
2061329c9b10SDr. David Alan Gilbert         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2062329c9b10SDr. David Alan Gilbert             chunks--;
2063329c9b10SDr. David Alan Gilbert         }
2064329c9b10SDr. David Alan Gilbert     } else {
2065329c9b10SDr. David Alan Gilbert         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2066329c9b10SDr. David Alan Gilbert 
2067329c9b10SDr. David Alan Gilbert         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2068329c9b10SDr. David Alan Gilbert             chunks--;
2069329c9b10SDr. David Alan Gilbert         }
2070329c9b10SDr. David Alan Gilbert     }
2071329c9b10SDr. David Alan Gilbert 
2072733252deSDr. David Alan Gilbert     trace_qemu_rdma_write_one_top(chunks + 1,
2073733252deSDr. David Alan Gilbert                                   (chunks + 1) *
2074733252deSDr. David Alan Gilbert                                   (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2075329c9b10SDr. David Alan Gilbert 
2076329c9b10SDr. David Alan Gilbert     chunk_end = ram_chunk_end(block, chunk + chunks);
2077329c9b10SDr. David Alan Gilbert 
2078329c9b10SDr. David Alan Gilbert 
2079329c9b10SDr. David Alan Gilbert     while (test_bit(chunk, block->transit_bitmap)) {
2080329c9b10SDr. David Alan Gilbert         (void)count;
2081733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2082329c9b10SDr. David Alan Gilbert                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2083329c9b10SDr. David Alan Gilbert 
2084329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2085329c9b10SDr. David Alan Gilbert 
2086329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2087733252deSDr. David Alan Gilbert             error_report("Failed to Wait for previous write to complete "
2088329c9b10SDr. David Alan Gilbert                     "block %d chunk %" PRIu64
2089733252deSDr. David Alan Gilbert                     " current %" PRIu64 " len %" PRIu64 " %d",
2090329c9b10SDr. David Alan Gilbert                     current_index, chunk, sge.addr, length, rdma->nb_sent);
2091329c9b10SDr. David Alan Gilbert             return ret;
2092329c9b10SDr. David Alan Gilbert         }
2093329c9b10SDr. David Alan Gilbert     }
2094329c9b10SDr. David Alan Gilbert 
2095329c9b10SDr. David Alan Gilbert     if (!rdma->pin_all || !block->is_ram_block) {
2096329c9b10SDr. David Alan Gilbert         if (!block->remote_keys[chunk]) {
2097329c9b10SDr. David Alan Gilbert             /*
2098329c9b10SDr. David Alan Gilbert              * This chunk has not yet been registered, so first check to see
2099329c9b10SDr. David Alan Gilbert              * if the entire chunk is zero. If so, tell the other size to
2100329c9b10SDr. David Alan Gilbert              * memset() + madvise() the entire chunk without RDMA.
2101329c9b10SDr. David Alan Gilbert              */
2102329c9b10SDr. David Alan Gilbert 
2103a1febc49SRichard Henderson             if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2104329c9b10SDr. David Alan Gilbert                 RDMACompress comp = {
2105329c9b10SDr. David Alan Gilbert                                         .offset = current_addr,
2106329c9b10SDr. David Alan Gilbert                                         .value = 0,
2107329c9b10SDr. David Alan Gilbert                                         .block_idx = current_index,
2108329c9b10SDr. David Alan Gilbert                                         .length = length,
2109329c9b10SDr. David Alan Gilbert                                     };
2110329c9b10SDr. David Alan Gilbert 
2111329c9b10SDr. David Alan Gilbert                 head.len = sizeof(comp);
2112329c9b10SDr. David Alan Gilbert                 head.type = RDMA_CONTROL_COMPRESS;
2113329c9b10SDr. David Alan Gilbert 
2114733252deSDr. David Alan Gilbert                 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2115733252deSDr. David Alan Gilbert                                                current_index, current_addr);
2116329c9b10SDr. David Alan Gilbert 
2117b12f7777SDr. David Alan Gilbert                 compress_to_network(rdma, &comp);
2118329c9b10SDr. David Alan Gilbert                 ret = qemu_rdma_exchange_send(rdma, &head,
2119329c9b10SDr. David Alan Gilbert                                 (uint8_t *) &comp, NULL, NULL, NULL);
2120329c9b10SDr. David Alan Gilbert 
2121329c9b10SDr. David Alan Gilbert                 if (ret < 0) {
2122329c9b10SDr. David Alan Gilbert                     return -EIO;
2123329c9b10SDr. David Alan Gilbert                 }
2124329c9b10SDr. David Alan Gilbert 
2125*c61d2faaSJuan Quintela                 stat64_add(&mig_stats.zero_pages,
2126*c61d2faaSJuan Quintela                            sge.length / qemu_target_page_size());
2127329c9b10SDr. David Alan Gilbert 
2128329c9b10SDr. David Alan Gilbert                 return 1;
2129329c9b10SDr. David Alan Gilbert             }
2130329c9b10SDr. David Alan Gilbert 
2131329c9b10SDr. David Alan Gilbert             /*
2132329c9b10SDr. David Alan Gilbert              * Otherwise, tell other side to register.
2133329c9b10SDr. David Alan Gilbert              */
2134329c9b10SDr. David Alan Gilbert             reg.current_index = current_index;
2135329c9b10SDr. David Alan Gilbert             if (block->is_ram_block) {
2136329c9b10SDr. David Alan Gilbert                 reg.key.current_addr = current_addr;
2137329c9b10SDr. David Alan Gilbert             } else {
2138329c9b10SDr. David Alan Gilbert                 reg.key.chunk = chunk;
2139329c9b10SDr. David Alan Gilbert             }
2140329c9b10SDr. David Alan Gilbert             reg.chunks = chunks;
2141329c9b10SDr. David Alan Gilbert 
2142733252deSDr. David Alan Gilbert             trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2143733252deSDr. David Alan Gilbert                                               current_addr);
2144329c9b10SDr. David Alan Gilbert 
2145b12f7777SDr. David Alan Gilbert             register_to_network(rdma, &reg);
2146329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
2147329c9b10SDr. David Alan Gilbert                                     &resp, &reg_result_idx, NULL);
2148329c9b10SDr. David Alan Gilbert             if (ret < 0) {
2149329c9b10SDr. David Alan Gilbert                 return ret;
2150329c9b10SDr. David Alan Gilbert             }
2151329c9b10SDr. David Alan Gilbert 
2152329c9b10SDr. David Alan Gilbert             /* try to overlap this single registration with the one we sent. */
21533ac040c0SStefan Weil             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2154329c9b10SDr. David Alan Gilbert                                                 &sge.lkey, NULL, chunk,
2155329c9b10SDr. David Alan Gilbert                                                 chunk_start, chunk_end)) {
2156733252deSDr. David Alan Gilbert                 error_report("cannot get lkey");
2157329c9b10SDr. David Alan Gilbert                 return -EINVAL;
2158329c9b10SDr. David Alan Gilbert             }
2159329c9b10SDr. David Alan Gilbert 
2160329c9b10SDr. David Alan Gilbert             reg_result = (RDMARegisterResult *)
2161329c9b10SDr. David Alan Gilbert                     rdma->wr_data[reg_result_idx].control_curr;
2162329c9b10SDr. David Alan Gilbert 
2163329c9b10SDr. David Alan Gilbert             network_to_result(reg_result);
2164329c9b10SDr. David Alan Gilbert 
2165733252deSDr. David Alan Gilbert             trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2166733252deSDr. David Alan Gilbert                                                  reg_result->rkey, chunk);
2167329c9b10SDr. David Alan Gilbert 
2168329c9b10SDr. David Alan Gilbert             block->remote_keys[chunk] = reg_result->rkey;
2169329c9b10SDr. David Alan Gilbert             block->remote_host_addr = reg_result->host_addr;
2170329c9b10SDr. David Alan Gilbert         } else {
2171329c9b10SDr. David Alan Gilbert             /* already registered before */
21723ac040c0SStefan Weil             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2173329c9b10SDr. David Alan Gilbert                                                 &sge.lkey, NULL, chunk,
2174329c9b10SDr. David Alan Gilbert                                                 chunk_start, chunk_end)) {
2175733252deSDr. David Alan Gilbert                 error_report("cannot get lkey!");
2176329c9b10SDr. David Alan Gilbert                 return -EINVAL;
2177329c9b10SDr. David Alan Gilbert             }
2178329c9b10SDr. David Alan Gilbert         }
2179329c9b10SDr. David Alan Gilbert 
2180329c9b10SDr. David Alan Gilbert         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2181329c9b10SDr. David Alan Gilbert     } else {
2182329c9b10SDr. David Alan Gilbert         send_wr.wr.rdma.rkey = block->remote_rkey;
2183329c9b10SDr. David Alan Gilbert 
21843ac040c0SStefan Weil         if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2185329c9b10SDr. David Alan Gilbert                                                      &sge.lkey, NULL, chunk,
2186329c9b10SDr. David Alan Gilbert                                                      chunk_start, chunk_end)) {
2187733252deSDr. David Alan Gilbert             error_report("cannot get lkey!");
2188329c9b10SDr. David Alan Gilbert             return -EINVAL;
2189329c9b10SDr. David Alan Gilbert         }
2190329c9b10SDr. David Alan Gilbert     }
2191329c9b10SDr. David Alan Gilbert 
2192329c9b10SDr. David Alan Gilbert     /*
2193329c9b10SDr. David Alan Gilbert      * Encode the ram block index and chunk within this wrid.
2194329c9b10SDr. David Alan Gilbert      * We will use this information at the time of completion
2195329c9b10SDr. David Alan Gilbert      * to figure out which bitmap to check against and then which
2196329c9b10SDr. David Alan Gilbert      * chunk in the bitmap to look for.
2197329c9b10SDr. David Alan Gilbert      */
2198329c9b10SDr. David Alan Gilbert     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2199329c9b10SDr. David Alan Gilbert                                         current_index, chunk);
2200329c9b10SDr. David Alan Gilbert 
2201329c9b10SDr. David Alan Gilbert     send_wr.opcode = IBV_WR_RDMA_WRITE;
2202329c9b10SDr. David Alan Gilbert     send_wr.send_flags = IBV_SEND_SIGNALED;
2203329c9b10SDr. David Alan Gilbert     send_wr.sg_list = &sge;
2204329c9b10SDr. David Alan Gilbert     send_wr.num_sge = 1;
2205329c9b10SDr. David Alan Gilbert     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2206329c9b10SDr. David Alan Gilbert                                 (current_addr - block->offset);
2207329c9b10SDr. David Alan Gilbert 
2208733252deSDr. David Alan Gilbert     trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2209329c9b10SDr. David Alan Gilbert                                    sge.length);
2210329c9b10SDr. David Alan Gilbert 
2211329c9b10SDr. David Alan Gilbert     /*
2212329c9b10SDr. David Alan Gilbert      * ibv_post_send() does not return negative error numbers,
2213329c9b10SDr. David Alan Gilbert      * per the specification they are positive - no idea why.
2214329c9b10SDr. David Alan Gilbert      */
2215329c9b10SDr. David Alan Gilbert     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2216329c9b10SDr. David Alan Gilbert 
2217329c9b10SDr. David Alan Gilbert     if (ret == ENOMEM) {
2218733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_one_queue_full();
2219329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2220329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2221733252deSDr. David Alan Gilbert             error_report("rdma migration: failed to make "
2222733252deSDr. David Alan Gilbert                          "room in full send queue! %d", ret);
2223329c9b10SDr. David Alan Gilbert             return ret;
2224329c9b10SDr. David Alan Gilbert         }
2225329c9b10SDr. David Alan Gilbert 
2226329c9b10SDr. David Alan Gilbert         goto retry;
2227329c9b10SDr. David Alan Gilbert 
2228329c9b10SDr. David Alan Gilbert     } else if (ret > 0) {
2229329c9b10SDr. David Alan Gilbert         perror("rdma migration: post rdma write failed");
2230329c9b10SDr. David Alan Gilbert         return -ret;
2231329c9b10SDr. David Alan Gilbert     }
2232329c9b10SDr. David Alan Gilbert 
2233329c9b10SDr. David Alan Gilbert     set_bit(chunk, block->transit_bitmap);
2234*c61d2faaSJuan Quintela     acct_update_position(f, sge.length);
2235329c9b10SDr. David Alan Gilbert     rdma->total_writes++;
2236329c9b10SDr. David Alan Gilbert 
2237329c9b10SDr. David Alan Gilbert     return 0;
2238329c9b10SDr. David Alan Gilbert }
2239329c9b10SDr. David Alan Gilbert 
2240329c9b10SDr. David Alan Gilbert /*
2241329c9b10SDr. David Alan Gilbert  * Push out any unwritten RDMA operations.
2242329c9b10SDr. David Alan Gilbert  *
2243329c9b10SDr. David Alan Gilbert  * We support sending out multiple chunks at the same time.
2244329c9b10SDr. David Alan Gilbert  * Not all of them need to get signaled in the completion queue.
2245329c9b10SDr. David Alan Gilbert  */
2246329c9b10SDr. David Alan Gilbert static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2247329c9b10SDr. David Alan Gilbert {
2248329c9b10SDr. David Alan Gilbert     int ret;
2249329c9b10SDr. David Alan Gilbert 
2250329c9b10SDr. David Alan Gilbert     if (!rdma->current_length) {
2251329c9b10SDr. David Alan Gilbert         return 0;
2252329c9b10SDr. David Alan Gilbert     }
2253329c9b10SDr. David Alan Gilbert 
2254329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_write_one(f, rdma,
2255329c9b10SDr. David Alan Gilbert             rdma->current_index, rdma->current_addr, rdma->current_length);
2256329c9b10SDr. David Alan Gilbert 
2257329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2258329c9b10SDr. David Alan Gilbert         return ret;
2259329c9b10SDr. David Alan Gilbert     }
2260329c9b10SDr. David Alan Gilbert 
2261329c9b10SDr. David Alan Gilbert     if (ret == 0) {
2262329c9b10SDr. David Alan Gilbert         rdma->nb_sent++;
2263733252deSDr. David Alan Gilbert         trace_qemu_rdma_write_flush(rdma->nb_sent);
2264329c9b10SDr. David Alan Gilbert     }
2265329c9b10SDr. David Alan Gilbert 
2266329c9b10SDr. David Alan Gilbert     rdma->current_length = 0;
2267329c9b10SDr. David Alan Gilbert     rdma->current_addr = 0;
2268329c9b10SDr. David Alan Gilbert 
2269329c9b10SDr. David Alan Gilbert     return 0;
2270329c9b10SDr. David Alan Gilbert }
2271329c9b10SDr. David Alan Gilbert 
2272329c9b10SDr. David Alan Gilbert static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2273329c9b10SDr. David Alan Gilbert                     uint64_t offset, uint64_t len)
2274329c9b10SDr. David Alan Gilbert {
2275329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block;
2276329c9b10SDr. David Alan Gilbert     uint8_t *host_addr;
2277329c9b10SDr. David Alan Gilbert     uint8_t *chunk_end;
2278329c9b10SDr. David Alan Gilbert 
2279329c9b10SDr. David Alan Gilbert     if (rdma->current_index < 0) {
2280329c9b10SDr. David Alan Gilbert         return 0;
2281329c9b10SDr. David Alan Gilbert     }
2282329c9b10SDr. David Alan Gilbert 
2283329c9b10SDr. David Alan Gilbert     if (rdma->current_chunk < 0) {
2284329c9b10SDr. David Alan Gilbert         return 0;
2285329c9b10SDr. David Alan Gilbert     }
2286329c9b10SDr. David Alan Gilbert 
2287329c9b10SDr. David Alan Gilbert     block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2288329c9b10SDr. David Alan Gilbert     host_addr = block->local_host_addr + (offset - block->offset);
2289329c9b10SDr. David Alan Gilbert     chunk_end = ram_chunk_end(block, rdma->current_chunk);
2290329c9b10SDr. David Alan Gilbert 
2291329c9b10SDr. David Alan Gilbert     if (rdma->current_length == 0) {
2292329c9b10SDr. David Alan Gilbert         return 0;
2293329c9b10SDr. David Alan Gilbert     }
2294329c9b10SDr. David Alan Gilbert 
2295329c9b10SDr. David Alan Gilbert     /*
2296329c9b10SDr. David Alan Gilbert      * Only merge into chunk sequentially.
2297329c9b10SDr. David Alan Gilbert      */
2298329c9b10SDr. David Alan Gilbert     if (offset != (rdma->current_addr + rdma->current_length)) {
2299329c9b10SDr. David Alan Gilbert         return 0;
2300329c9b10SDr. David Alan Gilbert     }
2301329c9b10SDr. David Alan Gilbert 
2302329c9b10SDr. David Alan Gilbert     if (offset < block->offset) {
2303329c9b10SDr. David Alan Gilbert         return 0;
2304329c9b10SDr. David Alan Gilbert     }
2305329c9b10SDr. David Alan Gilbert 
2306329c9b10SDr. David Alan Gilbert     if ((offset + len) > (block->offset + block->length)) {
2307329c9b10SDr. David Alan Gilbert         return 0;
2308329c9b10SDr. David Alan Gilbert     }
2309329c9b10SDr. David Alan Gilbert 
2310329c9b10SDr. David Alan Gilbert     if ((host_addr + len) > chunk_end) {
2311329c9b10SDr. David Alan Gilbert         return 0;
2312329c9b10SDr. David Alan Gilbert     }
2313329c9b10SDr. David Alan Gilbert 
2314329c9b10SDr. David Alan Gilbert     return 1;
2315329c9b10SDr. David Alan Gilbert }
2316329c9b10SDr. David Alan Gilbert 
2317329c9b10SDr. David Alan Gilbert /*
2318329c9b10SDr. David Alan Gilbert  * We're not actually writing here, but doing three things:
2319329c9b10SDr. David Alan Gilbert  *
2320329c9b10SDr. David Alan Gilbert  * 1. Identify the chunk the buffer belongs to.
2321329c9b10SDr. David Alan Gilbert  * 2. If the chunk is full or the buffer doesn't belong to the current
2322329c9b10SDr. David Alan Gilbert  *    chunk, then start a new chunk and flush() the old chunk.
2323329c9b10SDr. David Alan Gilbert  * 3. To keep the hardware busy, we also group chunks into batches
2324329c9b10SDr. David Alan Gilbert  *    and only require that a batch gets acknowledged in the completion
23253a4452d8Szhaolichang  *    queue instead of each individual chunk.
2326329c9b10SDr. David Alan Gilbert  */
2327329c9b10SDr. David Alan Gilbert static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2328329c9b10SDr. David Alan Gilbert                            uint64_t block_offset, uint64_t offset,
2329329c9b10SDr. David Alan Gilbert                            uint64_t len)
2330329c9b10SDr. David Alan Gilbert {
2331329c9b10SDr. David Alan Gilbert     uint64_t current_addr = block_offset + offset;
2332329c9b10SDr. David Alan Gilbert     uint64_t index = rdma->current_index;
2333329c9b10SDr. David Alan Gilbert     uint64_t chunk = rdma->current_chunk;
2334329c9b10SDr. David Alan Gilbert     int ret;
2335329c9b10SDr. David Alan Gilbert 
2336329c9b10SDr. David Alan Gilbert     /* If we cannot merge it, we flush the current buffer first. */
2337329c9b10SDr. David Alan Gilbert     if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2338329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_write_flush(f, rdma);
2339329c9b10SDr. David Alan Gilbert         if (ret) {
2340329c9b10SDr. David Alan Gilbert             return ret;
2341329c9b10SDr. David Alan Gilbert         }
2342329c9b10SDr. David Alan Gilbert         rdma->current_length = 0;
2343329c9b10SDr. David Alan Gilbert         rdma->current_addr = current_addr;
2344329c9b10SDr. David Alan Gilbert 
2345329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2346329c9b10SDr. David Alan Gilbert                                          offset, len, &index, &chunk);
2347329c9b10SDr. David Alan Gilbert         if (ret) {
2348733252deSDr. David Alan Gilbert             error_report("ram block search failed");
2349329c9b10SDr. David Alan Gilbert             return ret;
2350329c9b10SDr. David Alan Gilbert         }
2351329c9b10SDr. David Alan Gilbert         rdma->current_index = index;
2352329c9b10SDr. David Alan Gilbert         rdma->current_chunk = chunk;
2353329c9b10SDr. David Alan Gilbert     }
2354329c9b10SDr. David Alan Gilbert 
2355329c9b10SDr. David Alan Gilbert     /* merge it */
2356329c9b10SDr. David Alan Gilbert     rdma->current_length += len;
2357329c9b10SDr. David Alan Gilbert 
2358329c9b10SDr. David Alan Gilbert     /* flush it if buffer is too large */
2359329c9b10SDr. David Alan Gilbert     if (rdma->current_length >= RDMA_MERGE_MAX) {
2360329c9b10SDr. David Alan Gilbert         return qemu_rdma_write_flush(f, rdma);
2361329c9b10SDr. David Alan Gilbert     }
2362329c9b10SDr. David Alan Gilbert 
2363329c9b10SDr. David Alan Gilbert     return 0;
2364329c9b10SDr. David Alan Gilbert }
2365329c9b10SDr. David Alan Gilbert 
2366329c9b10SDr. David Alan Gilbert static void qemu_rdma_cleanup(RDMAContext *rdma)
2367329c9b10SDr. David Alan Gilbert {
2368c5e76115SLidong Chen     int idx;
2369329c9b10SDr. David Alan Gilbert 
2370329c9b10SDr. David Alan Gilbert     if (rdma->cm_id && rdma->connected) {
237132bce196SDr. David Alan Gilbert         if ((rdma->error_state ||
237232bce196SDr. David Alan Gilbert              migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
237332bce196SDr. David Alan Gilbert             !rdma->received_error) {
2374329c9b10SDr. David Alan Gilbert             RDMAControlHeader head = { .len = 0,
2375329c9b10SDr. David Alan Gilbert                                        .type = RDMA_CONTROL_ERROR,
2376329c9b10SDr. David Alan Gilbert                                        .repeat = 1,
2377329c9b10SDr. David Alan Gilbert                                      };
2378733252deSDr. David Alan Gilbert             error_report("Early error. Sending error.");
2379329c9b10SDr. David Alan Gilbert             qemu_rdma_post_send_control(rdma, NULL, &head);
2380329c9b10SDr. David Alan Gilbert         }
2381329c9b10SDr. David Alan Gilbert 
2382c5e76115SLidong Chen         rdma_disconnect(rdma->cm_id);
2383733252deSDr. David Alan Gilbert         trace_qemu_rdma_cleanup_disconnect();
2384329c9b10SDr. David Alan Gilbert         rdma->connected = false;
2385329c9b10SDr. David Alan Gilbert     }
2386329c9b10SDr. David Alan Gilbert 
2387cf75e268SDr. David Alan Gilbert     if (rdma->channel) {
2388fbbaacabSDr. David Alan Gilbert         qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2389cf75e268SDr. David Alan Gilbert     }
2390a97270adSDr. David Alan Gilbert     g_free(rdma->dest_blocks);
2391a97270adSDr. David Alan Gilbert     rdma->dest_blocks = NULL;
2392329c9b10SDr. David Alan Gilbert 
2393329c9b10SDr. David Alan Gilbert     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2394329c9b10SDr. David Alan Gilbert         if (rdma->wr_data[idx].control_mr) {
2395329c9b10SDr. David Alan Gilbert             rdma->total_registrations--;
2396329c9b10SDr. David Alan Gilbert             ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2397329c9b10SDr. David Alan Gilbert         }
2398329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control_mr = NULL;
2399329c9b10SDr. David Alan Gilbert     }
2400329c9b10SDr. David Alan Gilbert 
2401329c9b10SDr. David Alan Gilbert     if (rdma->local_ram_blocks.block) {
2402329c9b10SDr. David Alan Gilbert         while (rdma->local_ram_blocks.nb_blocks) {
240303fcab38SDr. David Alan Gilbert             rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2404329c9b10SDr. David Alan Gilbert         }
2405329c9b10SDr. David Alan Gilbert     }
2406329c9b10SDr. David Alan Gilbert 
240780b262e1SPadmanabh Ratnakar     if (rdma->qp) {
240880b262e1SPadmanabh Ratnakar         rdma_destroy_qp(rdma->cm_id);
240980b262e1SPadmanabh Ratnakar         rdma->qp = NULL;
241080b262e1SPadmanabh Ratnakar     }
2411b390afd8SLi Zhijian     if (rdma->recv_cq) {
2412b390afd8SLi Zhijian         ibv_destroy_cq(rdma->recv_cq);
2413b390afd8SLi Zhijian         rdma->recv_cq = NULL;
2414329c9b10SDr. David Alan Gilbert     }
2415b390afd8SLi Zhijian     if (rdma->send_cq) {
2416b390afd8SLi Zhijian         ibv_destroy_cq(rdma->send_cq);
2417b390afd8SLi Zhijian         rdma->send_cq = NULL;
2418b390afd8SLi Zhijian     }
2419b390afd8SLi Zhijian     if (rdma->recv_comp_channel) {
2420b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->recv_comp_channel);
2421b390afd8SLi Zhijian         rdma->recv_comp_channel = NULL;
2422b390afd8SLi Zhijian     }
2423b390afd8SLi Zhijian     if (rdma->send_comp_channel) {
2424b390afd8SLi Zhijian         ibv_destroy_comp_channel(rdma->send_comp_channel);
2425b390afd8SLi Zhijian         rdma->send_comp_channel = NULL;
2426329c9b10SDr. David Alan Gilbert     }
2427329c9b10SDr. David Alan Gilbert     if (rdma->pd) {
2428329c9b10SDr. David Alan Gilbert         ibv_dealloc_pd(rdma->pd);
2429329c9b10SDr. David Alan Gilbert         rdma->pd = NULL;
2430329c9b10SDr. David Alan Gilbert     }
243180b262e1SPadmanabh Ratnakar     if (rdma->cm_id) {
243280b262e1SPadmanabh Ratnakar         rdma_destroy_id(rdma->cm_id);
243380b262e1SPadmanabh Ratnakar         rdma->cm_id = NULL;
243480b262e1SPadmanabh Ratnakar     }
243555cc1b59SLidong Chen 
243655cc1b59SLidong Chen     /* the destination side, listen_id and channel is shared */
2437329c9b10SDr. David Alan Gilbert     if (rdma->listen_id) {
243855cc1b59SLidong Chen         if (!rdma->is_return_path) {
2439329c9b10SDr. David Alan Gilbert             rdma_destroy_id(rdma->listen_id);
2440329c9b10SDr. David Alan Gilbert         }
244155cc1b59SLidong Chen         rdma->listen_id = NULL;
244255cc1b59SLidong Chen 
244355cc1b59SLidong Chen         if (rdma->channel) {
244455cc1b59SLidong Chen             if (!rdma->is_return_path) {
244555cc1b59SLidong Chen                 rdma_destroy_event_channel(rdma->channel);
244655cc1b59SLidong Chen             }
244755cc1b59SLidong Chen             rdma->channel = NULL;
244855cc1b59SLidong Chen         }
244955cc1b59SLidong Chen     }
245055cc1b59SLidong Chen 
2451329c9b10SDr. David Alan Gilbert     if (rdma->channel) {
2452329c9b10SDr. David Alan Gilbert         rdma_destroy_event_channel(rdma->channel);
2453329c9b10SDr. David Alan Gilbert         rdma->channel = NULL;
2454329c9b10SDr. David Alan Gilbert     }
2455329c9b10SDr. David Alan Gilbert     g_free(rdma->host);
245644bcfd45SLi Zhijian     g_free(rdma->host_port);
2457329c9b10SDr. David Alan Gilbert     rdma->host = NULL;
245844bcfd45SLi Zhijian     rdma->host_port = NULL;
2459329c9b10SDr. David Alan Gilbert }
2460329c9b10SDr. David Alan Gilbert 
2461329c9b10SDr. David Alan Gilbert 
2462bbfb89e3SFam Zheng static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2463329c9b10SDr. David Alan Gilbert {
2464329c9b10SDr. David Alan Gilbert     int ret, idx;
2465329c9b10SDr. David Alan Gilbert     Error *local_err = NULL, **temp = &local_err;
2466329c9b10SDr. David Alan Gilbert 
2467329c9b10SDr. David Alan Gilbert     /*
2468329c9b10SDr. David Alan Gilbert      * Will be validated against destination's actual capabilities
2469329c9b10SDr. David Alan Gilbert      * after the connect() completes.
2470329c9b10SDr. David Alan Gilbert      */
2471329c9b10SDr. David Alan Gilbert     rdma->pin_all = pin_all;
2472329c9b10SDr. David Alan Gilbert 
2473329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_resolve_host(rdma, temp);
2474329c9b10SDr. David Alan Gilbert     if (ret) {
2475329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2476329c9b10SDr. David Alan Gilbert     }
2477329c9b10SDr. David Alan Gilbert 
2478329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_pd_cq(rdma);
2479329c9b10SDr. David Alan Gilbert     if (ret) {
2480329c9b10SDr. David Alan Gilbert         ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2481329c9b10SDr. David Alan Gilbert                     " limits may be too low. Please check $ ulimit -a # and "
2482329c9b10SDr. David Alan Gilbert                     "search for 'ulimit -l' in the output");
2483329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2484329c9b10SDr. David Alan Gilbert     }
2485329c9b10SDr. David Alan Gilbert 
2486329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_qp(rdma);
2487329c9b10SDr. David Alan Gilbert     if (ret) {
2488329c9b10SDr. David Alan Gilbert         ERROR(temp, "rdma migration: error allocating qp!");
2489329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2490329c9b10SDr. David Alan Gilbert     }
2491329c9b10SDr. David Alan Gilbert 
2492329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_init_ram_blocks(rdma);
2493329c9b10SDr. David Alan Gilbert     if (ret) {
2494329c9b10SDr. David Alan Gilbert         ERROR(temp, "rdma migration: error initializing ram blocks!");
2495329c9b10SDr. David Alan Gilbert         goto err_rdma_source_init;
2496329c9b10SDr. David Alan Gilbert     }
2497329c9b10SDr. David Alan Gilbert 
2498760ff4beSDr. David Alan Gilbert     /* Build the hash that maps from offset to RAMBlock */
2499760ff4beSDr. David Alan Gilbert     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2500760ff4beSDr. David Alan Gilbert     for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2501760ff4beSDr. David Alan Gilbert         g_hash_table_insert(rdma->blockmap,
2502760ff4beSDr. David Alan Gilbert                 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2503760ff4beSDr. David Alan Gilbert                 &rdma->local_ram_blocks.block[idx]);
2504760ff4beSDr. David Alan Gilbert     }
2505760ff4beSDr. David Alan Gilbert 
2506329c9b10SDr. David Alan Gilbert     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2507329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_reg_control(rdma, idx);
2508329c9b10SDr. David Alan Gilbert         if (ret) {
2509329c9b10SDr. David Alan Gilbert             ERROR(temp, "rdma migration: error registering %d control!",
2510329c9b10SDr. David Alan Gilbert                                                             idx);
2511329c9b10SDr. David Alan Gilbert             goto err_rdma_source_init;
2512329c9b10SDr. David Alan Gilbert         }
2513329c9b10SDr. David Alan Gilbert     }
2514329c9b10SDr. David Alan Gilbert 
2515329c9b10SDr. David Alan Gilbert     return 0;
2516329c9b10SDr. David Alan Gilbert 
2517329c9b10SDr. David Alan Gilbert err_rdma_source_init:
2518329c9b10SDr. David Alan Gilbert     error_propagate(errp, local_err);
2519329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
2520329c9b10SDr. David Alan Gilbert     return -1;
2521329c9b10SDr. David Alan Gilbert }
2522329c9b10SDr. David Alan Gilbert 
2523e49e49ddSLi Zhijian static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2524e49e49ddSLi Zhijian                                      struct rdma_cm_event **cm_event,
2525e49e49ddSLi Zhijian                                      long msec, Error **errp)
2526e49e49ddSLi Zhijian {
2527e49e49ddSLi Zhijian     int ret;
2528e49e49ddSLi Zhijian     struct pollfd poll_fd = {
2529e49e49ddSLi Zhijian                                 .fd = rdma->channel->fd,
2530e49e49ddSLi Zhijian                                 .events = POLLIN,
2531e49e49ddSLi Zhijian                                 .revents = 0
2532e49e49ddSLi Zhijian                             };
2533e49e49ddSLi Zhijian 
2534e49e49ddSLi Zhijian     do {
2535e49e49ddSLi Zhijian         ret = poll(&poll_fd, 1, msec);
2536e49e49ddSLi Zhijian     } while (ret < 0 && errno == EINTR);
2537e49e49ddSLi Zhijian 
2538e49e49ddSLi Zhijian     if (ret == 0) {
2539e49e49ddSLi Zhijian         ERROR(errp, "poll cm event timeout");
2540e49e49ddSLi Zhijian         return -1;
2541e49e49ddSLi Zhijian     } else if (ret < 0) {
2542e49e49ddSLi Zhijian         ERROR(errp, "failed to poll cm event, errno=%i", errno);
2543e49e49ddSLi Zhijian         return -1;
2544e49e49ddSLi Zhijian     } else if (poll_fd.revents & POLLIN) {
2545e49e49ddSLi Zhijian         return rdma_get_cm_event(rdma->channel, cm_event);
2546e49e49ddSLi Zhijian     } else {
2547e49e49ddSLi Zhijian         ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
2548e49e49ddSLi Zhijian         return -1;
2549e49e49ddSLi Zhijian     }
2550e49e49ddSLi Zhijian }
2551e49e49ddSLi Zhijian 
2552e49e49ddSLi Zhijian static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
2553329c9b10SDr. David Alan Gilbert {
2554329c9b10SDr. David Alan Gilbert     RDMACapabilities cap = {
2555329c9b10SDr. David Alan Gilbert                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2556329c9b10SDr. David Alan Gilbert                                 .flags = 0,
2557329c9b10SDr. David Alan Gilbert                            };
2558329c9b10SDr. David Alan Gilbert     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2559329c9b10SDr. David Alan Gilbert                                           .retry_count = 5,
2560329c9b10SDr. David Alan Gilbert                                           .private_data = &cap,
2561329c9b10SDr. David Alan Gilbert                                           .private_data_len = sizeof(cap),
2562329c9b10SDr. David Alan Gilbert                                         };
2563329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
2564329c9b10SDr. David Alan Gilbert     int ret;
2565329c9b10SDr. David Alan Gilbert 
2566329c9b10SDr. David Alan Gilbert     /*
2567329c9b10SDr. David Alan Gilbert      * Only negotiate the capability with destination if the user
2568329c9b10SDr. David Alan Gilbert      * on the source first requested the capability.
2569329c9b10SDr. David Alan Gilbert      */
2570329c9b10SDr. David Alan Gilbert     if (rdma->pin_all) {
2571733252deSDr. David Alan Gilbert         trace_qemu_rdma_connect_pin_all_requested();
2572329c9b10SDr. David Alan Gilbert         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2573329c9b10SDr. David Alan Gilbert     }
2574329c9b10SDr. David Alan Gilbert 
2575329c9b10SDr. David Alan Gilbert     caps_to_network(&cap);
2576329c9b10SDr. David Alan Gilbert 
25779cf2bab2SDr. David Alan Gilbert     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
25789cf2bab2SDr. David Alan Gilbert     if (ret) {
25799cf2bab2SDr. David Alan Gilbert         ERROR(errp, "posting second control recv");
25809cf2bab2SDr. David Alan Gilbert         goto err_rdma_source_connect;
25819cf2bab2SDr. David Alan Gilbert     }
25829cf2bab2SDr. David Alan Gilbert 
2583329c9b10SDr. David Alan Gilbert     ret = rdma_connect(rdma->cm_id, &conn_param);
2584329c9b10SDr. David Alan Gilbert     if (ret) {
2585329c9b10SDr. David Alan Gilbert         perror("rdma_connect");
2586329c9b10SDr. David Alan Gilbert         ERROR(errp, "connecting to destination!");
2587329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2588329c9b10SDr. David Alan Gilbert     }
2589329c9b10SDr. David Alan Gilbert 
2590e49e49ddSLi Zhijian     if (return_path) {
2591e49e49ddSLi Zhijian         ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2592e49e49ddSLi Zhijian     } else {
2593329c9b10SDr. David Alan Gilbert         ret = rdma_get_cm_event(rdma->channel, &cm_event);
2594e49e49ddSLi Zhijian     }
2595329c9b10SDr. David Alan Gilbert     if (ret) {
2596329c9b10SDr. David Alan Gilbert         perror("rdma_get_cm_event after rdma_connect");
2597329c9b10SDr. David Alan Gilbert         ERROR(errp, "connecting to destination!");
2598329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2599329c9b10SDr. David Alan Gilbert     }
2600329c9b10SDr. David Alan Gilbert 
2601329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2602e5f60791SLi Zhijian         error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2603329c9b10SDr. David Alan Gilbert         ERROR(errp, "connecting to destination!");
2604329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
2605329c9b10SDr. David Alan Gilbert         goto err_rdma_source_connect;
2606329c9b10SDr. David Alan Gilbert     }
2607329c9b10SDr. David Alan Gilbert     rdma->connected = true;
2608329c9b10SDr. David Alan Gilbert 
2609329c9b10SDr. David Alan Gilbert     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2610329c9b10SDr. David Alan Gilbert     network_to_caps(&cap);
2611329c9b10SDr. David Alan Gilbert 
2612329c9b10SDr. David Alan Gilbert     /*
2613329c9b10SDr. David Alan Gilbert      * Verify that the *requested* capabilities are supported by the destination
2614329c9b10SDr. David Alan Gilbert      * and disable them otherwise.
2615329c9b10SDr. David Alan Gilbert      */
2616329c9b10SDr. David Alan Gilbert     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2617329c9b10SDr. David Alan Gilbert         ERROR(errp, "Server cannot support pinning all memory. "
2618329c9b10SDr. David Alan Gilbert                         "Will register memory dynamically.");
2619329c9b10SDr. David Alan Gilbert         rdma->pin_all = false;
2620329c9b10SDr. David Alan Gilbert     }
2621329c9b10SDr. David Alan Gilbert 
2622733252deSDr. David Alan Gilbert     trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2623329c9b10SDr. David Alan Gilbert 
2624329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
2625329c9b10SDr. David Alan Gilbert 
2626329c9b10SDr. David Alan Gilbert     rdma->control_ready_expected = 1;
2627329c9b10SDr. David Alan Gilbert     rdma->nb_sent = 0;
2628329c9b10SDr. David Alan Gilbert     return 0;
2629329c9b10SDr. David Alan Gilbert 
2630329c9b10SDr. David Alan Gilbert err_rdma_source_connect:
2631329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
2632329c9b10SDr. David Alan Gilbert     return -1;
2633329c9b10SDr. David Alan Gilbert }
2634329c9b10SDr. David Alan Gilbert 
2635329c9b10SDr. David Alan Gilbert static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2636329c9b10SDr. David Alan Gilbert {
26371dbd2fd9SMichael Tokarev     int ret, idx;
2638329c9b10SDr. David Alan Gilbert     struct rdma_cm_id *listen_id;
2639329c9b10SDr. David Alan Gilbert     char ip[40] = "unknown";
26401dbd2fd9SMichael Tokarev     struct rdma_addrinfo *res, *e;
2641329c9b10SDr. David Alan Gilbert     char port_str[16];
2642f736e414SJack Wang     int reuse = 1;
2643329c9b10SDr. David Alan Gilbert 
2644329c9b10SDr. David Alan Gilbert     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2645329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control_len = 0;
2646329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control_curr = NULL;
2647329c9b10SDr. David Alan Gilbert     }
2648329c9b10SDr. David Alan Gilbert 
26491dbd2fd9SMichael Tokarev     if (!rdma->host || !rdma->host[0]) {
2650329c9b10SDr. David Alan Gilbert         ERROR(errp, "RDMA host is not set!");
2651329c9b10SDr. David Alan Gilbert         rdma->error_state = -EINVAL;
2652329c9b10SDr. David Alan Gilbert         return -1;
2653329c9b10SDr. David Alan Gilbert     }
2654329c9b10SDr. David Alan Gilbert     /* create CM channel */
2655329c9b10SDr. David Alan Gilbert     rdma->channel = rdma_create_event_channel();
2656329c9b10SDr. David Alan Gilbert     if (!rdma->channel) {
2657329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not create rdma event channel");
2658329c9b10SDr. David Alan Gilbert         rdma->error_state = -EINVAL;
2659329c9b10SDr. David Alan Gilbert         return -1;
2660329c9b10SDr. David Alan Gilbert     }
2661329c9b10SDr. David Alan Gilbert 
2662329c9b10SDr. David Alan Gilbert     /* create CM id */
2663329c9b10SDr. David Alan Gilbert     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2664329c9b10SDr. David Alan Gilbert     if (ret) {
2665329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not create cm_id!");
2666329c9b10SDr. David Alan Gilbert         goto err_dest_init_create_listen_id;
2667329c9b10SDr. David Alan Gilbert     }
2668329c9b10SDr. David Alan Gilbert 
2669329c9b10SDr. David Alan Gilbert     snprintf(port_str, 16, "%d", rdma->port);
2670329c9b10SDr. David Alan Gilbert     port_str[15] = '\0';
2671329c9b10SDr. David Alan Gilbert 
2672329c9b10SDr. David Alan Gilbert     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2673329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2674329c9b10SDr. David Alan Gilbert         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2675329c9b10SDr. David Alan Gilbert         goto err_dest_init_bind_addr;
2676329c9b10SDr. David Alan Gilbert     }
2677329c9b10SDr. David Alan Gilbert 
2678f736e414SJack Wang     ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
2679f736e414SJack Wang                           &reuse, sizeof reuse);
2680f736e414SJack Wang     if (ret) {
2681f736e414SJack Wang         ERROR(errp, "Error: could not set REUSEADDR option");
2682f736e414SJack Wang         goto err_dest_init_bind_addr;
2683f736e414SJack Wang     }
2684329c9b10SDr. David Alan Gilbert     for (e = res; e != NULL; e = e->ai_next) {
2685329c9b10SDr. David Alan Gilbert         inet_ntop(e->ai_family,
2686329c9b10SDr. David Alan Gilbert             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2687733252deSDr. David Alan Gilbert         trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2688329c9b10SDr. David Alan Gilbert         ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
26891dbd2fd9SMichael Tokarev         if (ret) {
26901dbd2fd9SMichael Tokarev             continue;
26911dbd2fd9SMichael Tokarev         }
2692329c9b10SDr. David Alan Gilbert         if (e->ai_family == AF_INET6) {
2693bbfb89e3SFam Zheng             ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2694329c9b10SDr. David Alan Gilbert             if (ret) {
2695329c9b10SDr. David Alan Gilbert                 continue;
2696329c9b10SDr. David Alan Gilbert             }
2697329c9b10SDr. David Alan Gilbert         }
26981dbd2fd9SMichael Tokarev         break;
2699329c9b10SDr. David Alan Gilbert     }
2700329c9b10SDr. David Alan Gilbert 
2701f53b450aSLi Zhijian     rdma_freeaddrinfo(res);
27021dbd2fd9SMichael Tokarev     if (!e) {
2703329c9b10SDr. David Alan Gilbert         ERROR(errp, "Error: could not rdma_bind_addr!");
2704329c9b10SDr. David Alan Gilbert         goto err_dest_init_bind_addr;
2705329c9b10SDr. David Alan Gilbert     }
2706329c9b10SDr. David Alan Gilbert 
2707329c9b10SDr. David Alan Gilbert     rdma->listen_id = listen_id;
2708329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("dest_init", listen_id);
2709329c9b10SDr. David Alan Gilbert     return 0;
2710329c9b10SDr. David Alan Gilbert 
2711329c9b10SDr. David Alan Gilbert err_dest_init_bind_addr:
2712329c9b10SDr. David Alan Gilbert     rdma_destroy_id(listen_id);
2713329c9b10SDr. David Alan Gilbert err_dest_init_create_listen_id:
2714329c9b10SDr. David Alan Gilbert     rdma_destroy_event_channel(rdma->channel);
2715329c9b10SDr. David Alan Gilbert     rdma->channel = NULL;
2716329c9b10SDr. David Alan Gilbert     rdma->error_state = ret;
2717329c9b10SDr. David Alan Gilbert     return ret;
2718329c9b10SDr. David Alan Gilbert 
2719329c9b10SDr. David Alan Gilbert }
2720329c9b10SDr. David Alan Gilbert 
272155cc1b59SLidong Chen static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
272255cc1b59SLidong Chen                                             RDMAContext *rdma)
272355cc1b59SLidong Chen {
272455cc1b59SLidong Chen     int idx;
272555cc1b59SLidong Chen 
272655cc1b59SLidong Chen     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
272755cc1b59SLidong Chen         rdma_return_path->wr_data[idx].control_len = 0;
272855cc1b59SLidong Chen         rdma_return_path->wr_data[idx].control_curr = NULL;
272955cc1b59SLidong Chen     }
273055cc1b59SLidong Chen 
273155cc1b59SLidong Chen     /*the CM channel and CM id is shared*/
273255cc1b59SLidong Chen     rdma_return_path->channel = rdma->channel;
273355cc1b59SLidong Chen     rdma_return_path->listen_id = rdma->listen_id;
273455cc1b59SLidong Chen 
273555cc1b59SLidong Chen     rdma->return_path = rdma_return_path;
273655cc1b59SLidong Chen     rdma_return_path->return_path = rdma;
273755cc1b59SLidong Chen     rdma_return_path->is_return_path = true;
273855cc1b59SLidong Chen }
273955cc1b59SLidong Chen 
2740329c9b10SDr. David Alan Gilbert static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2741329c9b10SDr. David Alan Gilbert {
2742329c9b10SDr. David Alan Gilbert     RDMAContext *rdma = NULL;
2743329c9b10SDr. David Alan Gilbert     InetSocketAddress *addr;
2744329c9b10SDr. David Alan Gilbert 
2745329c9b10SDr. David Alan Gilbert     if (host_port) {
274697f3ad35SMarkus Armbruster         rdma = g_new0(RDMAContext, 1);
2747329c9b10SDr. David Alan Gilbert         rdma->current_index = -1;
2748329c9b10SDr. David Alan Gilbert         rdma->current_chunk = -1;
2749329c9b10SDr. David Alan Gilbert 
27500785bd7aSMarkus Armbruster         addr = g_new(InetSocketAddress, 1);
27510785bd7aSMarkus Armbruster         if (!inet_parse(addr, host_port, NULL)) {
2752329c9b10SDr. David Alan Gilbert             rdma->port = atoi(addr->port);
2753329c9b10SDr. David Alan Gilbert             rdma->host = g_strdup(addr->host);
275444bcfd45SLi Zhijian             rdma->host_port = g_strdup(host_port);
2755329c9b10SDr. David Alan Gilbert         } else {
2756329c9b10SDr. David Alan Gilbert             ERROR(errp, "bad RDMA migration address '%s'", host_port);
2757329c9b10SDr. David Alan Gilbert             g_free(rdma);
2758329c9b10SDr. David Alan Gilbert             rdma = NULL;
2759329c9b10SDr. David Alan Gilbert         }
2760329c9b10SDr. David Alan Gilbert 
2761329c9b10SDr. David Alan Gilbert         qapi_free_InetSocketAddress(addr);
2762329c9b10SDr. David Alan Gilbert     }
2763329c9b10SDr. David Alan Gilbert 
2764329c9b10SDr. David Alan Gilbert     return rdma;
2765329c9b10SDr. David Alan Gilbert }
2766329c9b10SDr. David Alan Gilbert 
2767329c9b10SDr. David Alan Gilbert /*
2768329c9b10SDr. David Alan Gilbert  * QEMUFile interface to the control channel.
2769329c9b10SDr. David Alan Gilbert  * SEND messages for control only.
2770329c9b10SDr. David Alan Gilbert  * VM's ram is handled with regular RDMA messages.
2771329c9b10SDr. David Alan Gilbert  */
27726ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
27736ddd2d76SDaniel P. Berrange                                        const struct iovec *iov,
27746ddd2d76SDaniel P. Berrange                                        size_t niov,
27756ddd2d76SDaniel P. Berrange                                        int *fds,
27766ddd2d76SDaniel P. Berrange                                        size_t nfds,
2777b88651cbSLeonardo Bras                                        int flags,
27786ddd2d76SDaniel P. Berrange                                        Error **errp)
2779329c9b10SDr. David Alan Gilbert {
27806ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
27816ddd2d76SDaniel P. Berrange     QEMUFile *f = rioc->file;
278274637e6fSLidong Chen     RDMAContext *rdma;
2783329c9b10SDr. David Alan Gilbert     int ret;
27846ddd2d76SDaniel P. Berrange     ssize_t done = 0;
27856ddd2d76SDaniel P. Berrange     size_t i;
2786f38f6d41SLidong Chen     size_t len = 0;
2787329c9b10SDr. David Alan Gilbert 
2788987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
2789d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
279074637e6fSLidong Chen 
279174637e6fSLidong Chen     if (!rdma) {
279274ecf6acSFiona Ebner         error_setg(errp, "RDMA control channel output is not set");
279374ecf6acSFiona Ebner         return -1;
279474637e6fSLidong Chen     }
279574637e6fSLidong Chen 
2796329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
2797329c9b10SDr. David Alan Gilbert 
2798329c9b10SDr. David Alan Gilbert     /*
2799329c9b10SDr. David Alan Gilbert      * Push out any writes that
2800329c9b10SDr. David Alan Gilbert      * we're queued up for VM's ram.
2801329c9b10SDr. David Alan Gilbert      */
2802329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_write_flush(f, rdma);
2803329c9b10SDr. David Alan Gilbert     if (ret < 0) {
2804329c9b10SDr. David Alan Gilbert         rdma->error_state = ret;
280574ecf6acSFiona Ebner         error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
280674ecf6acSFiona Ebner         return -1;
2807329c9b10SDr. David Alan Gilbert     }
2808329c9b10SDr. David Alan Gilbert 
28096ddd2d76SDaniel P. Berrange     for (i = 0; i < niov; i++) {
28106ddd2d76SDaniel P. Berrange         size_t remaining = iov[i].iov_len;
28116ddd2d76SDaniel P. Berrange         uint8_t * data = (void *)iov[i].iov_base;
2812329c9b10SDr. David Alan Gilbert         while (remaining) {
2813329c9b10SDr. David Alan Gilbert             RDMAControlHeader head;
2814329c9b10SDr. David Alan Gilbert 
2815f38f6d41SLidong Chen             len = MIN(remaining, RDMA_SEND_INCREMENT);
2816f38f6d41SLidong Chen             remaining -= len;
2817329c9b10SDr. David Alan Gilbert 
2818f38f6d41SLidong Chen             head.len = len;
2819329c9b10SDr. David Alan Gilbert             head.type = RDMA_CONTROL_QEMU_FILE;
2820329c9b10SDr. David Alan Gilbert 
2821329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2822329c9b10SDr. David Alan Gilbert 
2823329c9b10SDr. David Alan Gilbert             if (ret < 0) {
2824329c9b10SDr. David Alan Gilbert                 rdma->error_state = ret;
282574ecf6acSFiona Ebner                 error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
282674ecf6acSFiona Ebner                 return -1;
2827329c9b10SDr. David Alan Gilbert             }
2828329c9b10SDr. David Alan Gilbert 
2829f38f6d41SLidong Chen             data += len;
2830f38f6d41SLidong Chen             done += len;
28316ddd2d76SDaniel P. Berrange         }
2832329c9b10SDr. David Alan Gilbert     }
2833329c9b10SDr. David Alan Gilbert 
28346ddd2d76SDaniel P. Berrange     return done;
2835329c9b10SDr. David Alan Gilbert }
2836329c9b10SDr. David Alan Gilbert 
2837329c9b10SDr. David Alan Gilbert static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2838a202a4c0SDr. David Alan Gilbert                              size_t size, int idx)
2839329c9b10SDr. David Alan Gilbert {
2840329c9b10SDr. David Alan Gilbert     size_t len = 0;
2841329c9b10SDr. David Alan Gilbert 
2842329c9b10SDr. David Alan Gilbert     if (rdma->wr_data[idx].control_len) {
2843733252deSDr. David Alan Gilbert         trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2844329c9b10SDr. David Alan Gilbert 
2845329c9b10SDr. David Alan Gilbert         len = MIN(size, rdma->wr_data[idx].control_len);
2846329c9b10SDr. David Alan Gilbert         memcpy(buf, rdma->wr_data[idx].control_curr, len);
2847329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control_curr += len;
2848329c9b10SDr. David Alan Gilbert         rdma->wr_data[idx].control_len -= len;
2849329c9b10SDr. David Alan Gilbert     }
2850329c9b10SDr. David Alan Gilbert 
2851329c9b10SDr. David Alan Gilbert     return len;
2852329c9b10SDr. David Alan Gilbert }
2853329c9b10SDr. David Alan Gilbert 
2854329c9b10SDr. David Alan Gilbert /*
2855329c9b10SDr. David Alan Gilbert  * QEMUFile interface to the control channel.
2856329c9b10SDr. David Alan Gilbert  * RDMA links don't use bytestreams, so we have to
2857329c9b10SDr. David Alan Gilbert  * return bytes to QEMUFile opportunistically.
2858329c9b10SDr. David Alan Gilbert  */
28596ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
28606ddd2d76SDaniel P. Berrange                                       const struct iovec *iov,
28616ddd2d76SDaniel P. Berrange                                       size_t niov,
28626ddd2d76SDaniel P. Berrange                                       int **fds,
28636ddd2d76SDaniel P. Berrange                                       size_t *nfds,
286484615a19Smanish.mishra                                       int flags,
28656ddd2d76SDaniel P. Berrange                                       Error **errp)
2866329c9b10SDr. David Alan Gilbert {
28676ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
286874637e6fSLidong Chen     RDMAContext *rdma;
2869329c9b10SDr. David Alan Gilbert     RDMAControlHeader head;
2870329c9b10SDr. David Alan Gilbert     int ret = 0;
28716ddd2d76SDaniel P. Berrange     ssize_t i;
28726ddd2d76SDaniel P. Berrange     size_t done = 0;
2873329c9b10SDr. David Alan Gilbert 
2874987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
2875d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmain);
287674637e6fSLidong Chen 
287774637e6fSLidong Chen     if (!rdma) {
287874ecf6acSFiona Ebner         error_setg(errp, "RDMA control channel input is not set");
287974ecf6acSFiona Ebner         return -1;
288074637e6fSLidong Chen     }
288174637e6fSLidong Chen 
2882329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
2883329c9b10SDr. David Alan Gilbert 
28846ddd2d76SDaniel P. Berrange     for (i = 0; i < niov; i++) {
28856ddd2d76SDaniel P. Berrange         size_t want = iov[i].iov_len;
28866ddd2d76SDaniel P. Berrange         uint8_t *data = (void *)iov[i].iov_base;
28876ddd2d76SDaniel P. Berrange 
2888329c9b10SDr. David Alan Gilbert         /*
2889329c9b10SDr. David Alan Gilbert          * First, we hold on to the last SEND message we
2890329c9b10SDr. David Alan Gilbert          * were given and dish out the bytes until we run
2891329c9b10SDr. David Alan Gilbert          * out of bytes.
2892329c9b10SDr. David Alan Gilbert          */
289374637e6fSLidong Chen         ret = qemu_rdma_fill(rdma, data, want, 0);
28946ddd2d76SDaniel P. Berrange         done += ret;
28956ddd2d76SDaniel P. Berrange         want -= ret;
28966ddd2d76SDaniel P. Berrange         /* Got what we needed, so go to next iovec */
28976ddd2d76SDaniel P. Berrange         if (want == 0) {
28986ddd2d76SDaniel P. Berrange             continue;
2899329c9b10SDr. David Alan Gilbert         }
2900329c9b10SDr. David Alan Gilbert 
29016ddd2d76SDaniel P. Berrange         /* If we got any data so far, then don't wait
29026ddd2d76SDaniel P. Berrange          * for more, just return what we have */
29036ddd2d76SDaniel P. Berrange         if (done > 0) {
29046ddd2d76SDaniel P. Berrange             break;
29056ddd2d76SDaniel P. Berrange         }
29066ddd2d76SDaniel P. Berrange 
29076ddd2d76SDaniel P. Berrange 
29086ddd2d76SDaniel P. Berrange         /* We've got nothing at all, so lets wait for
29096ddd2d76SDaniel P. Berrange          * more to arrive
2910329c9b10SDr. David Alan Gilbert          */
2911329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2912329c9b10SDr. David Alan Gilbert 
2913329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2914329c9b10SDr. David Alan Gilbert             rdma->error_state = ret;
291574ecf6acSFiona Ebner             error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
291674ecf6acSFiona Ebner             return -1;
2917329c9b10SDr. David Alan Gilbert         }
2918329c9b10SDr. David Alan Gilbert 
2919329c9b10SDr. David Alan Gilbert         /*
2920329c9b10SDr. David Alan Gilbert          * SEND was received with new bytes, now try again.
2921329c9b10SDr. David Alan Gilbert          */
292274637e6fSLidong Chen         ret = qemu_rdma_fill(rdma, data, want, 0);
29236ddd2d76SDaniel P. Berrange         done += ret;
29246ddd2d76SDaniel P. Berrange         want -= ret;
29256ddd2d76SDaniel P. Berrange 
29266ddd2d76SDaniel P. Berrange         /* Still didn't get enough, so lets just return */
29276ddd2d76SDaniel P. Berrange         if (want) {
29286ddd2d76SDaniel P. Berrange             if (done == 0) {
29296ddd2d76SDaniel P. Berrange                 return QIO_CHANNEL_ERR_BLOCK;
29306ddd2d76SDaniel P. Berrange             } else {
29316ddd2d76SDaniel P. Berrange                 break;
29326ddd2d76SDaniel P. Berrange             }
29336ddd2d76SDaniel P. Berrange         }
29346ddd2d76SDaniel P. Berrange     }
2935f38f6d41SLidong Chen     return done;
2936329c9b10SDr. David Alan Gilbert }
2937329c9b10SDr. David Alan Gilbert 
2938329c9b10SDr. David Alan Gilbert /*
2939329c9b10SDr. David Alan Gilbert  * Block until all the outstanding chunks have been delivered by the hardware.
2940329c9b10SDr. David Alan Gilbert  */
2941329c9b10SDr. David Alan Gilbert static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2942329c9b10SDr. David Alan Gilbert {
2943329c9b10SDr. David Alan Gilbert     int ret;
2944329c9b10SDr. David Alan Gilbert 
2945329c9b10SDr. David Alan Gilbert     if (qemu_rdma_write_flush(f, rdma) < 0) {
2946329c9b10SDr. David Alan Gilbert         return -EIO;
2947329c9b10SDr. David Alan Gilbert     }
2948329c9b10SDr. David Alan Gilbert 
2949329c9b10SDr. David Alan Gilbert     while (rdma->nb_sent) {
2950329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2951329c9b10SDr. David Alan Gilbert         if (ret < 0) {
2952733252deSDr. David Alan Gilbert             error_report("rdma migration: complete polling error!");
2953329c9b10SDr. David Alan Gilbert             return -EIO;
2954329c9b10SDr. David Alan Gilbert         }
2955329c9b10SDr. David Alan Gilbert     }
2956329c9b10SDr. David Alan Gilbert 
2957329c9b10SDr. David Alan Gilbert     qemu_rdma_unregister_waiting(rdma);
2958329c9b10SDr. David Alan Gilbert 
2959329c9b10SDr. David Alan Gilbert     return 0;
2960329c9b10SDr. David Alan Gilbert }
2961329c9b10SDr. David Alan Gilbert 
29626ddd2d76SDaniel P. Berrange 
29636ddd2d76SDaniel P. Berrange static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
29646ddd2d76SDaniel P. Berrange                                          bool blocking,
29656ddd2d76SDaniel P. Berrange                                          Error **errp)
2966329c9b10SDr. David Alan Gilbert {
29676ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
29686ddd2d76SDaniel P. Berrange     /* XXX we should make readv/writev actually honour this :-) */
29696ddd2d76SDaniel P. Berrange     rioc->blocking = blocking;
29706ddd2d76SDaniel P. Berrange     return 0;
2971329c9b10SDr. David Alan Gilbert }
29726ddd2d76SDaniel P. Berrange 
29736ddd2d76SDaniel P. Berrange 
29746ddd2d76SDaniel P. Berrange typedef struct QIOChannelRDMASource QIOChannelRDMASource;
29756ddd2d76SDaniel P. Berrange struct QIOChannelRDMASource {
29766ddd2d76SDaniel P. Berrange     GSource parent;
29776ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc;
29786ddd2d76SDaniel P. Berrange     GIOCondition condition;
29796ddd2d76SDaniel P. Berrange };
29806ddd2d76SDaniel P. Berrange 
29816ddd2d76SDaniel P. Berrange static gboolean
29826ddd2d76SDaniel P. Berrange qio_channel_rdma_source_prepare(GSource *source,
29836ddd2d76SDaniel P. Berrange                                 gint *timeout)
29846ddd2d76SDaniel P. Berrange {
29856ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
298674637e6fSLidong Chen     RDMAContext *rdma;
29876ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
29886ddd2d76SDaniel P. Berrange     *timeout = -1;
29896ddd2d76SDaniel P. Berrange 
2990987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
299174637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
2992d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
299374637e6fSLidong Chen     } else {
2994d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
299574637e6fSLidong Chen     }
299674637e6fSLidong Chen 
299774637e6fSLidong Chen     if (!rdma) {
299874637e6fSLidong Chen         error_report("RDMAContext is NULL when prepare Gsource");
299974637e6fSLidong Chen         return FALSE;
300074637e6fSLidong Chen     }
300174637e6fSLidong Chen 
30026ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
30036ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
30046ddd2d76SDaniel P. Berrange     }
30056ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
30066ddd2d76SDaniel P. Berrange 
30076ddd2d76SDaniel P. Berrange     return cond & rsource->condition;
30086ddd2d76SDaniel P. Berrange }
30096ddd2d76SDaniel P. Berrange 
30106ddd2d76SDaniel P. Berrange static gboolean
30116ddd2d76SDaniel P. Berrange qio_channel_rdma_source_check(GSource *source)
30126ddd2d76SDaniel P. Berrange {
30136ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
301474637e6fSLidong Chen     RDMAContext *rdma;
30156ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
30166ddd2d76SDaniel P. Berrange 
3017987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
301874637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
3019d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
302074637e6fSLidong Chen     } else {
3021d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
302274637e6fSLidong Chen     }
302374637e6fSLidong Chen 
302474637e6fSLidong Chen     if (!rdma) {
302574637e6fSLidong Chen         error_report("RDMAContext is NULL when check Gsource");
302674637e6fSLidong Chen         return FALSE;
302774637e6fSLidong Chen     }
302874637e6fSLidong Chen 
30296ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
30306ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
30316ddd2d76SDaniel P. Berrange     }
30326ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
30336ddd2d76SDaniel P. Berrange 
30346ddd2d76SDaniel P. Berrange     return cond & rsource->condition;
30356ddd2d76SDaniel P. Berrange }
30366ddd2d76SDaniel P. Berrange 
30376ddd2d76SDaniel P. Berrange static gboolean
30386ddd2d76SDaniel P. Berrange qio_channel_rdma_source_dispatch(GSource *source,
30396ddd2d76SDaniel P. Berrange                                  GSourceFunc callback,
30406ddd2d76SDaniel P. Berrange                                  gpointer user_data)
30416ddd2d76SDaniel P. Berrange {
30426ddd2d76SDaniel P. Berrange     QIOChannelFunc func = (QIOChannelFunc)callback;
30436ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
304474637e6fSLidong Chen     RDMAContext *rdma;
30456ddd2d76SDaniel P. Berrange     GIOCondition cond = 0;
30466ddd2d76SDaniel P. Berrange 
3047987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
304874637e6fSLidong Chen     if (rsource->condition == G_IO_IN) {
3049d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
305074637e6fSLidong Chen     } else {
3051d73415a3SStefan Hajnoczi         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
305274637e6fSLidong Chen     }
305374637e6fSLidong Chen 
305474637e6fSLidong Chen     if (!rdma) {
305574637e6fSLidong Chen         error_report("RDMAContext is NULL when dispatch Gsource");
305674637e6fSLidong Chen         return FALSE;
305774637e6fSLidong Chen     }
305874637e6fSLidong Chen 
30596ddd2d76SDaniel P. Berrange     if (rdma->wr_data[0].control_len) {
30606ddd2d76SDaniel P. Berrange         cond |= G_IO_IN;
30616ddd2d76SDaniel P. Berrange     }
30626ddd2d76SDaniel P. Berrange     cond |= G_IO_OUT;
30636ddd2d76SDaniel P. Berrange 
30646ddd2d76SDaniel P. Berrange     return (*func)(QIO_CHANNEL(rsource->rioc),
30656ddd2d76SDaniel P. Berrange                    (cond & rsource->condition),
30666ddd2d76SDaniel P. Berrange                    user_data);
30676ddd2d76SDaniel P. Berrange }
30686ddd2d76SDaniel P. Berrange 
30696ddd2d76SDaniel P. Berrange static void
30706ddd2d76SDaniel P. Berrange qio_channel_rdma_source_finalize(GSource *source)
30716ddd2d76SDaniel P. Berrange {
30726ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
30736ddd2d76SDaniel P. Berrange 
30746ddd2d76SDaniel P. Berrange     object_unref(OBJECT(ssource->rioc));
30756ddd2d76SDaniel P. Berrange }
30766ddd2d76SDaniel P. Berrange 
30776ddd2d76SDaniel P. Berrange GSourceFuncs qio_channel_rdma_source_funcs = {
30786ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_prepare,
30796ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_check,
30806ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_dispatch,
30816ddd2d76SDaniel P. Berrange     qio_channel_rdma_source_finalize
30826ddd2d76SDaniel P. Berrange };
30836ddd2d76SDaniel P. Berrange 
30846ddd2d76SDaniel P. Berrange static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
30856ddd2d76SDaniel P. Berrange                                               GIOCondition condition)
30866ddd2d76SDaniel P. Berrange {
30876ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
30886ddd2d76SDaniel P. Berrange     QIOChannelRDMASource *ssource;
30896ddd2d76SDaniel P. Berrange     GSource *source;
30906ddd2d76SDaniel P. Berrange 
30916ddd2d76SDaniel P. Berrange     source = g_source_new(&qio_channel_rdma_source_funcs,
30926ddd2d76SDaniel P. Berrange                           sizeof(QIOChannelRDMASource));
30936ddd2d76SDaniel P. Berrange     ssource = (QIOChannelRDMASource *)source;
30946ddd2d76SDaniel P. Berrange 
30956ddd2d76SDaniel P. Berrange     ssource->rioc = rioc;
30966ddd2d76SDaniel P. Berrange     object_ref(OBJECT(rioc));
30976ddd2d76SDaniel P. Berrange 
30986ddd2d76SDaniel P. Berrange     ssource->condition = condition;
30996ddd2d76SDaniel P. Berrange 
31006ddd2d76SDaniel P. Berrange     return source;
31016ddd2d76SDaniel P. Berrange }
31026ddd2d76SDaniel P. Berrange 
31034d9f675bSLidong Chen static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
31044d9f675bSLidong Chen                                                   AioContext *ctx,
31054d9f675bSLidong Chen                                                   IOHandler *io_read,
31064d9f675bSLidong Chen                                                   IOHandler *io_write,
31074d9f675bSLidong Chen                                                   void *opaque)
31084d9f675bSLidong Chen {
31094d9f675bSLidong Chen     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
31104d9f675bSLidong Chen     if (io_read) {
3111b390afd8SLi Zhijian         aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
3112826cc324SStefan Hajnoczi                            false, io_read, io_write, NULL, NULL, opaque);
3113b390afd8SLi Zhijian         aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
3114826cc324SStefan Hajnoczi                            false, io_read, io_write, NULL, NULL, opaque);
31154d9f675bSLidong Chen     } else {
3116b390afd8SLi Zhijian         aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
3117826cc324SStefan Hajnoczi                            false, io_read, io_write, NULL, NULL, opaque);
3118b390afd8SLi Zhijian         aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
3119826cc324SStefan Hajnoczi                            false, io_read, io_write, NULL, NULL, opaque);
31204d9f675bSLidong Chen     }
31214d9f675bSLidong Chen }
31226ddd2d76SDaniel P. Berrange 
3123d46a4847SDr. David Alan Gilbert struct rdma_close_rcu {
3124d46a4847SDr. David Alan Gilbert     struct rcu_head rcu;
3125d46a4847SDr. David Alan Gilbert     RDMAContext *rdmain;
3126d46a4847SDr. David Alan Gilbert     RDMAContext *rdmaout;
3127d46a4847SDr. David Alan Gilbert };
3128d46a4847SDr. David Alan Gilbert 
3129d46a4847SDr. David Alan Gilbert /* callback from qio_channel_rdma_close via call_rcu */
3130d46a4847SDr. David Alan Gilbert static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3131d46a4847SDr. David Alan Gilbert {
3132d46a4847SDr. David Alan Gilbert     if (rcu->rdmain) {
3133d46a4847SDr. David Alan Gilbert         qemu_rdma_cleanup(rcu->rdmain);
3134d46a4847SDr. David Alan Gilbert     }
3135d46a4847SDr. David Alan Gilbert 
3136d46a4847SDr. David Alan Gilbert     if (rcu->rdmaout) {
3137d46a4847SDr. David Alan Gilbert         qemu_rdma_cleanup(rcu->rdmaout);
3138d46a4847SDr. David Alan Gilbert     }
3139d46a4847SDr. David Alan Gilbert 
3140d46a4847SDr. David Alan Gilbert     g_free(rcu->rdmain);
3141d46a4847SDr. David Alan Gilbert     g_free(rcu->rdmaout);
3142d46a4847SDr. David Alan Gilbert     g_free(rcu);
3143d46a4847SDr. David Alan Gilbert }
3144d46a4847SDr. David Alan Gilbert 
31456ddd2d76SDaniel P. Berrange static int qio_channel_rdma_close(QIOChannel *ioc,
31466ddd2d76SDaniel P. Berrange                                   Error **errp)
31476ddd2d76SDaniel P. Berrange {
31486ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
314974637e6fSLidong Chen     RDMAContext *rdmain, *rdmaout;
3150d46a4847SDr. David Alan Gilbert     struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3151d46a4847SDr. David Alan Gilbert 
31526ddd2d76SDaniel P. Berrange     trace_qemu_rdma_close();
315374637e6fSLidong Chen 
315474637e6fSLidong Chen     rdmain = rioc->rdmain;
315574637e6fSLidong Chen     if (rdmain) {
3156d73415a3SStefan Hajnoczi         qatomic_rcu_set(&rioc->rdmain, NULL);
315712c67ffbSDr. David Alan Gilbert     }
315874637e6fSLidong Chen 
315974637e6fSLidong Chen     rdmaout = rioc->rdmaout;
316074637e6fSLidong Chen     if (rdmaout) {
3161d73415a3SStefan Hajnoczi         qatomic_rcu_set(&rioc->rdmaout, NULL);
31626ddd2d76SDaniel P. Berrange     }
316374637e6fSLidong Chen 
3164d46a4847SDr. David Alan Gilbert     rcu->rdmain = rdmain;
3165d46a4847SDr. David Alan Gilbert     rcu->rdmaout = rdmaout;
3166d46a4847SDr. David Alan Gilbert     call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
316774637e6fSLidong Chen 
3168329c9b10SDr. David Alan Gilbert     return 0;
3169329c9b10SDr. David Alan Gilbert }
3170329c9b10SDr. David Alan Gilbert 
317154db882fSLidong Chen static int
317254db882fSLidong Chen qio_channel_rdma_shutdown(QIOChannel *ioc,
317354db882fSLidong Chen                             QIOChannelShutdown how,
317454db882fSLidong Chen                             Error **errp)
317554db882fSLidong Chen {
317654db882fSLidong Chen     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
317754db882fSLidong Chen     RDMAContext *rdmain, *rdmaout;
317854db882fSLidong Chen 
3179987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
318054db882fSLidong Chen 
3181d73415a3SStefan Hajnoczi     rdmain = qatomic_rcu_read(&rioc->rdmain);
3182d73415a3SStefan Hajnoczi     rdmaout = qatomic_rcu_read(&rioc->rdmain);
318354db882fSLidong Chen 
318454db882fSLidong Chen     switch (how) {
318554db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_READ:
318654db882fSLidong Chen         if (rdmain) {
318754db882fSLidong Chen             rdmain->error_state = -1;
318854db882fSLidong Chen         }
318954db882fSLidong Chen         break;
319054db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_WRITE:
319154db882fSLidong Chen         if (rdmaout) {
319254db882fSLidong Chen             rdmaout->error_state = -1;
319354db882fSLidong Chen         }
319454db882fSLidong Chen         break;
319554db882fSLidong Chen     case QIO_CHANNEL_SHUTDOWN_BOTH:
319654db882fSLidong Chen     default:
319754db882fSLidong Chen         if (rdmain) {
319854db882fSLidong Chen             rdmain->error_state = -1;
319954db882fSLidong Chen         }
320054db882fSLidong Chen         if (rdmaout) {
320154db882fSLidong Chen             rdmaout->error_state = -1;
320254db882fSLidong Chen         }
320354db882fSLidong Chen         break;
320454db882fSLidong Chen     }
320554db882fSLidong Chen 
320654db882fSLidong Chen     return 0;
320754db882fSLidong Chen }
320854db882fSLidong Chen 
3209329c9b10SDr. David Alan Gilbert /*
3210329c9b10SDr. David Alan Gilbert  * Parameters:
3211329c9b10SDr. David Alan Gilbert  *    @offset == 0 :
3212329c9b10SDr. David Alan Gilbert  *        This means that 'block_offset' is a full virtual address that does not
3213329c9b10SDr. David Alan Gilbert  *        belong to a RAMBlock of the virtual machine and instead
3214329c9b10SDr. David Alan Gilbert  *        represents a private malloc'd memory area that the caller wishes to
3215329c9b10SDr. David Alan Gilbert  *        transfer.
3216329c9b10SDr. David Alan Gilbert  *
3217329c9b10SDr. David Alan Gilbert  *    @offset != 0 :
3218329c9b10SDr. David Alan Gilbert  *        Offset is an offset to be added to block_offset and used
3219329c9b10SDr. David Alan Gilbert  *        to also lookup the corresponding RAMBlock.
3220329c9b10SDr. David Alan Gilbert  *
3221246683c2SDaniel P. Berrangé  *    @size : Number of bytes to transfer
3222329c9b10SDr. David Alan Gilbert  *
3223329c9b10SDr. David Alan Gilbert  *    @bytes_sent : User-specificed pointer to indicate how many bytes were
3224329c9b10SDr. David Alan Gilbert  *                  sent. Usually, this will not be more than a few bytes of
3225329c9b10SDr. David Alan Gilbert  *                  the protocol because most transfers are sent asynchronously.
3226329c9b10SDr. David Alan Gilbert  */
3227365c0463SDaniel P. Berrangé static size_t qemu_rdma_save_page(QEMUFile *f,
3228329c9b10SDr. David Alan Gilbert                                   ram_addr_t block_offset, ram_addr_t offset,
32296e1dea46SJuan Quintela                                   size_t size, uint64_t *bytes_sent)
3230329c9b10SDr. David Alan Gilbert {
3231365c0463SDaniel P. Berrangé     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
323274637e6fSLidong Chen     RDMAContext *rdma;
3233329c9b10SDr. David Alan Gilbert     int ret;
3234329c9b10SDr. David Alan Gilbert 
3235987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3236d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
323774637e6fSLidong Chen 
323874637e6fSLidong Chen     if (!rdma) {
323974637e6fSLidong Chen         return -EIO;
324074637e6fSLidong Chen     }
324174637e6fSLidong Chen 
3242329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
3243329c9b10SDr. David Alan Gilbert 
32446a88eb2bSWei Yang     if (migration_in_postcopy()) {
3245ccb7e1b5SLidong Chen         return RAM_SAVE_CONTROL_NOT_SUPP;
3246ccb7e1b5SLidong Chen     }
3247ccb7e1b5SLidong Chen 
3248329c9b10SDr. David Alan Gilbert     qemu_fflush(f);
3249329c9b10SDr. David Alan Gilbert 
3250329c9b10SDr. David Alan Gilbert     /*
3251329c9b10SDr. David Alan Gilbert      * Add this page to the current 'chunk'. If the chunk
32523a4452d8Szhaolichang      * is full, or the page doesn't belong to the current chunk,
3253329c9b10SDr. David Alan Gilbert      * an actual RDMA write will occur and a new chunk will be formed.
3254329c9b10SDr. David Alan Gilbert      */
3255329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3256329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3257733252deSDr. David Alan Gilbert         error_report("rdma migration: write error! %d", ret);
3258329c9b10SDr. David Alan Gilbert         goto err;
3259329c9b10SDr. David Alan Gilbert     }
3260329c9b10SDr. David Alan Gilbert 
3261329c9b10SDr. David Alan Gilbert     /*
3262329c9b10SDr. David Alan Gilbert      * We always return 1 bytes because the RDMA
3263329c9b10SDr. David Alan Gilbert      * protocol is completely asynchronous. We do not yet know
3264329c9b10SDr. David Alan Gilbert      * whether an  identified chunk is zero or not because we're
3265329c9b10SDr. David Alan Gilbert      * waiting for other pages to potentially be merged with
3266329c9b10SDr. David Alan Gilbert      * the current chunk. So, we have to call qemu_update_position()
3267329c9b10SDr. David Alan Gilbert      * later on when the actual write occurs.
3268329c9b10SDr. David Alan Gilbert      */
3269329c9b10SDr. David Alan Gilbert     if (bytes_sent) {
3270329c9b10SDr. David Alan Gilbert         *bytes_sent = 1;
3271329c9b10SDr. David Alan Gilbert     }
3272329c9b10SDr. David Alan Gilbert 
3273329c9b10SDr. David Alan Gilbert     /*
3274329c9b10SDr. David Alan Gilbert      * Drain the Completion Queue if possible, but do not block,
3275329c9b10SDr. David Alan Gilbert      * just poll.
3276329c9b10SDr. David Alan Gilbert      *
3277329c9b10SDr. David Alan Gilbert      * If nothing to poll, the end of the iteration will do this
3278329c9b10SDr. David Alan Gilbert      * again to make sure we don't overflow the request queue.
3279329c9b10SDr. David Alan Gilbert      */
3280329c9b10SDr. David Alan Gilbert     while (1) {
3281329c9b10SDr. David Alan Gilbert         uint64_t wr_id, wr_id_in;
3282b390afd8SLi Zhijian         int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3283b390afd8SLi Zhijian         if (ret < 0) {
3284b390afd8SLi Zhijian             error_report("rdma migration: polling error! %d", ret);
3285b390afd8SLi Zhijian             goto err;
3286b390afd8SLi Zhijian         }
3287b390afd8SLi Zhijian 
3288b390afd8SLi Zhijian         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3289b390afd8SLi Zhijian 
3290b390afd8SLi Zhijian         if (wr_id == RDMA_WRID_NONE) {
3291b390afd8SLi Zhijian             break;
3292b390afd8SLi Zhijian         }
3293b390afd8SLi Zhijian     }
3294b390afd8SLi Zhijian 
3295b390afd8SLi Zhijian     while (1) {
3296b390afd8SLi Zhijian         uint64_t wr_id, wr_id_in;
3297b390afd8SLi Zhijian         int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3298329c9b10SDr. David Alan Gilbert         if (ret < 0) {
3299733252deSDr. David Alan Gilbert             error_report("rdma migration: polling error! %d", ret);
3300329c9b10SDr. David Alan Gilbert             goto err;
3301329c9b10SDr. David Alan Gilbert         }
3302329c9b10SDr. David Alan Gilbert 
3303329c9b10SDr. David Alan Gilbert         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3304329c9b10SDr. David Alan Gilbert 
3305329c9b10SDr. David Alan Gilbert         if (wr_id == RDMA_WRID_NONE) {
3306329c9b10SDr. David Alan Gilbert             break;
3307329c9b10SDr. David Alan Gilbert         }
3308329c9b10SDr. David Alan Gilbert     }
3309329c9b10SDr. David Alan Gilbert 
3310329c9b10SDr. David Alan Gilbert     return RAM_SAVE_CONTROL_DELAYED;
3311329c9b10SDr. David Alan Gilbert err:
3312329c9b10SDr. David Alan Gilbert     rdma->error_state = ret;
3313329c9b10SDr. David Alan Gilbert     return ret;
3314329c9b10SDr. David Alan Gilbert }
3315329c9b10SDr. David Alan Gilbert 
331655cc1b59SLidong Chen static void rdma_accept_incoming_migration(void *opaque);
331755cc1b59SLidong Chen 
331892370989SLidong Chen static void rdma_cm_poll_handler(void *opaque)
331992370989SLidong Chen {
332092370989SLidong Chen     RDMAContext *rdma = opaque;
332192370989SLidong Chen     int ret;
332292370989SLidong Chen     struct rdma_cm_event *cm_event;
332392370989SLidong Chen     MigrationIncomingState *mis = migration_incoming_get_current();
332492370989SLidong Chen 
332592370989SLidong Chen     ret = rdma_get_cm_event(rdma->channel, &cm_event);
332692370989SLidong Chen     if (ret) {
332792370989SLidong Chen         error_report("get_cm_event failed %d", errno);
332892370989SLidong Chen         return;
332992370989SLidong Chen     }
333092370989SLidong Chen 
333192370989SLidong Chen     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
333292370989SLidong Chen         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3333de8434a3SDr. David Alan Gilbert         if (!rdma->error_state &&
3334de8434a3SDr. David Alan Gilbert             migration_incoming_get_current()->state !=
3335de8434a3SDr. David Alan Gilbert               MIGRATION_STATUS_COMPLETED) {
333692370989SLidong Chen             error_report("receive cm event, cm event is %d", cm_event->event);
333792370989SLidong Chen             rdma->error_state = -EPIPE;
333892370989SLidong Chen             if (rdma->return_path) {
333992370989SLidong Chen                 rdma->return_path->error_state = -EPIPE;
334092370989SLidong Chen             }
3341de8434a3SDr. David Alan Gilbert         }
33426b8c2eb5SLi Zhijian         rdma_ack_cm_event(cm_event);
334392370989SLidong Chen 
334492370989SLidong Chen         if (mis->migration_incoming_co) {
334592370989SLidong Chen             qemu_coroutine_enter(mis->migration_incoming_co);
334692370989SLidong Chen         }
334792370989SLidong Chen         return;
334892370989SLidong Chen     }
33496b8c2eb5SLi Zhijian     rdma_ack_cm_event(cm_event);
335092370989SLidong Chen }
335192370989SLidong Chen 
3352329c9b10SDr. David Alan Gilbert static int qemu_rdma_accept(RDMAContext *rdma)
3353329c9b10SDr. David Alan Gilbert {
3354329c9b10SDr. David Alan Gilbert     RDMACapabilities cap;
3355329c9b10SDr. David Alan Gilbert     struct rdma_conn_param conn_param = {
3356329c9b10SDr. David Alan Gilbert                                             .responder_resources = 2,
3357329c9b10SDr. David Alan Gilbert                                             .private_data = &cap,
3358329c9b10SDr. David Alan Gilbert                                             .private_data_len = sizeof(cap),
3359329c9b10SDr. David Alan Gilbert                                          };
336044bcfd45SLi Zhijian     RDMAContext *rdma_return_path = NULL;
3361329c9b10SDr. David Alan Gilbert     struct rdma_cm_event *cm_event;
3362329c9b10SDr. David Alan Gilbert     struct ibv_context *verbs;
3363329c9b10SDr. David Alan Gilbert     int ret = -EINVAL;
3364329c9b10SDr. David Alan Gilbert     int idx;
3365329c9b10SDr. David Alan Gilbert 
3366329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3367329c9b10SDr. David Alan Gilbert     if (ret) {
3368329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3369329c9b10SDr. David Alan Gilbert     }
3370329c9b10SDr. David Alan Gilbert 
3371329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3372329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
3373329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3374329c9b10SDr. David Alan Gilbert     }
3375329c9b10SDr. David Alan Gilbert 
337644bcfd45SLi Zhijian     /*
337744bcfd45SLi Zhijian      * initialize the RDMAContext for return path for postcopy after first
337844bcfd45SLi Zhijian      * connection request reached.
337944bcfd45SLi Zhijian      */
338038ad1110SJuan Quintela     if ((migrate_postcopy() || migrate_return_path())
3381a5382214SDr. David Alan Gilbert         && !rdma->is_return_path) {
338244bcfd45SLi Zhijian         rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
338344bcfd45SLi Zhijian         if (rdma_return_path == NULL) {
338444bcfd45SLi Zhijian             rdma_ack_cm_event(cm_event);
338544bcfd45SLi Zhijian             goto err_rdma_dest_wait;
338644bcfd45SLi Zhijian         }
338744bcfd45SLi Zhijian 
338844bcfd45SLi Zhijian         qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
338944bcfd45SLi Zhijian     }
339044bcfd45SLi Zhijian 
3391329c9b10SDr. David Alan Gilbert     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3392329c9b10SDr. David Alan Gilbert 
3393329c9b10SDr. David Alan Gilbert     network_to_caps(&cap);
3394329c9b10SDr. David Alan Gilbert 
3395329c9b10SDr. David Alan Gilbert     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3396733252deSDr. David Alan Gilbert             error_report("Unknown source RDMA version: %d, bailing...",
3397329c9b10SDr. David Alan Gilbert                             cap.version);
3398329c9b10SDr. David Alan Gilbert             rdma_ack_cm_event(cm_event);
3399329c9b10SDr. David Alan Gilbert             goto err_rdma_dest_wait;
3400329c9b10SDr. David Alan Gilbert     }
3401329c9b10SDr. David Alan Gilbert 
3402329c9b10SDr. David Alan Gilbert     /*
3403329c9b10SDr. David Alan Gilbert      * Respond with only the capabilities this version of QEMU knows about.
3404329c9b10SDr. David Alan Gilbert      */
3405329c9b10SDr. David Alan Gilbert     cap.flags &= known_capabilities;
3406329c9b10SDr. David Alan Gilbert 
3407329c9b10SDr. David Alan Gilbert     /*
3408329c9b10SDr. David Alan Gilbert      * Enable the ones that we do know about.
3409329c9b10SDr. David Alan Gilbert      * Add other checks here as new ones are introduced.
3410329c9b10SDr. David Alan Gilbert      */
3411329c9b10SDr. David Alan Gilbert     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3412329c9b10SDr. David Alan Gilbert         rdma->pin_all = true;
3413329c9b10SDr. David Alan Gilbert     }
3414329c9b10SDr. David Alan Gilbert 
3415329c9b10SDr. David Alan Gilbert     rdma->cm_id = cm_event->id;
3416329c9b10SDr. David Alan Gilbert     verbs = cm_event->id->verbs;
3417329c9b10SDr. David Alan Gilbert 
3418329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
3419329c9b10SDr. David Alan Gilbert 
3420733252deSDr. David Alan Gilbert     trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3421329c9b10SDr. David Alan Gilbert 
3422329c9b10SDr. David Alan Gilbert     caps_to_network(&cap);
3423329c9b10SDr. David Alan Gilbert 
3424733252deSDr. David Alan Gilbert     trace_qemu_rdma_accept_pin_verbsc(verbs);
3425329c9b10SDr. David Alan Gilbert 
3426329c9b10SDr. David Alan Gilbert     if (!rdma->verbs) {
3427329c9b10SDr. David Alan Gilbert         rdma->verbs = verbs;
3428329c9b10SDr. David Alan Gilbert     } else if (rdma->verbs != verbs) {
3429733252deSDr. David Alan Gilbert             error_report("ibv context not matching %p, %p!", rdma->verbs,
3430733252deSDr. David Alan Gilbert                          verbs);
3431329c9b10SDr. David Alan Gilbert             goto err_rdma_dest_wait;
3432329c9b10SDr. David Alan Gilbert     }
3433329c9b10SDr. David Alan Gilbert 
3434329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_id("dest_init", verbs);
3435329c9b10SDr. David Alan Gilbert 
3436329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_pd_cq(rdma);
3437329c9b10SDr. David Alan Gilbert     if (ret) {
3438733252deSDr. David Alan Gilbert         error_report("rdma migration: error allocating pd and cq!");
3439329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3440329c9b10SDr. David Alan Gilbert     }
3441329c9b10SDr. David Alan Gilbert 
3442329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_alloc_qp(rdma);
3443329c9b10SDr. David Alan Gilbert     if (ret) {
3444733252deSDr. David Alan Gilbert         error_report("rdma migration: error allocating qp!");
3445329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3446329c9b10SDr. David Alan Gilbert     }
3447329c9b10SDr. David Alan Gilbert 
3448329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_init_ram_blocks(rdma);
3449329c9b10SDr. David Alan Gilbert     if (ret) {
3450733252deSDr. David Alan Gilbert         error_report("rdma migration: error initializing ram blocks!");
3451329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3452329c9b10SDr. David Alan Gilbert     }
3453329c9b10SDr. David Alan Gilbert 
3454329c9b10SDr. David Alan Gilbert     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3455329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_reg_control(rdma, idx);
3456329c9b10SDr. David Alan Gilbert         if (ret) {
3457733252deSDr. David Alan Gilbert             error_report("rdma: error registering %d control", idx);
3458329c9b10SDr. David Alan Gilbert             goto err_rdma_dest_wait;
3459329c9b10SDr. David Alan Gilbert         }
3460329c9b10SDr. David Alan Gilbert     }
3461329c9b10SDr. David Alan Gilbert 
346255cc1b59SLidong Chen     /* Accept the second connection request for return path */
346338ad1110SJuan Quintela     if ((migrate_postcopy() || migrate_return_path())
3464a5382214SDr. David Alan Gilbert         && !rdma->is_return_path) {
346555cc1b59SLidong Chen         qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
346655cc1b59SLidong Chen                             NULL,
346755cc1b59SLidong Chen                             (void *)(intptr_t)rdma->return_path);
346855cc1b59SLidong Chen     } else {
346992370989SLidong Chen         qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
347092370989SLidong Chen                             NULL, rdma);
347155cc1b59SLidong Chen     }
3472329c9b10SDr. David Alan Gilbert 
3473329c9b10SDr. David Alan Gilbert     ret = rdma_accept(rdma->cm_id, &conn_param);
3474329c9b10SDr. David Alan Gilbert     if (ret) {
3475733252deSDr. David Alan Gilbert         error_report("rdma_accept returns %d", ret);
3476329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3477329c9b10SDr. David Alan Gilbert     }
3478329c9b10SDr. David Alan Gilbert 
3479329c9b10SDr. David Alan Gilbert     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3480329c9b10SDr. David Alan Gilbert     if (ret) {
3481733252deSDr. David Alan Gilbert         error_report("rdma_accept get_cm_event failed %d", ret);
3482329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3483329c9b10SDr. David Alan Gilbert     }
3484329c9b10SDr. David Alan Gilbert 
3485329c9b10SDr. David Alan Gilbert     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3486733252deSDr. David Alan Gilbert         error_report("rdma_accept not event established");
3487329c9b10SDr. David Alan Gilbert         rdma_ack_cm_event(cm_event);
3488329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3489329c9b10SDr. David Alan Gilbert     }
3490329c9b10SDr. David Alan Gilbert 
3491329c9b10SDr. David Alan Gilbert     rdma_ack_cm_event(cm_event);
3492329c9b10SDr. David Alan Gilbert     rdma->connected = true;
3493329c9b10SDr. David Alan Gilbert 
3494329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3495329c9b10SDr. David Alan Gilbert     if (ret) {
3496733252deSDr. David Alan Gilbert         error_report("rdma migration: error posting second control recv");
3497329c9b10SDr. David Alan Gilbert         goto err_rdma_dest_wait;
3498329c9b10SDr. David Alan Gilbert     }
3499329c9b10SDr. David Alan Gilbert 
3500329c9b10SDr. David Alan Gilbert     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3501329c9b10SDr. David Alan Gilbert 
3502329c9b10SDr. David Alan Gilbert     return 0;
3503329c9b10SDr. David Alan Gilbert 
3504329c9b10SDr. David Alan Gilbert err_rdma_dest_wait:
3505329c9b10SDr. David Alan Gilbert     rdma->error_state = ret;
3506329c9b10SDr. David Alan Gilbert     qemu_rdma_cleanup(rdma);
350744bcfd45SLi Zhijian     g_free(rdma_return_path);
3508329c9b10SDr. David Alan Gilbert     return ret;
3509329c9b10SDr. David Alan Gilbert }
3510329c9b10SDr. David Alan Gilbert 
3511e4d63320SDr. David Alan Gilbert static int dest_ram_sort_func(const void *a, const void *b)
3512e4d63320SDr. David Alan Gilbert {
3513e4d63320SDr. David Alan Gilbert     unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3514e4d63320SDr. David Alan Gilbert     unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3515e4d63320SDr. David Alan Gilbert 
3516e4d63320SDr. David Alan Gilbert     return (a_index < b_index) ? -1 : (a_index != b_index);
3517e4d63320SDr. David Alan Gilbert }
3518e4d63320SDr. David Alan Gilbert 
3519329c9b10SDr. David Alan Gilbert /*
3520329c9b10SDr. David Alan Gilbert  * During each iteration of the migration, we listen for instructions
3521329c9b10SDr. David Alan Gilbert  * by the source VM to perform dynamic page registrations before they
3522329c9b10SDr. David Alan Gilbert  * can perform RDMA operations.
3523329c9b10SDr. David Alan Gilbert  *
3524329c9b10SDr. David Alan Gilbert  * We respond with the 'rkey'.
3525329c9b10SDr. David Alan Gilbert  *
3526329c9b10SDr. David Alan Gilbert  * Keep doing this until the source tells us to stop.
3527329c9b10SDr. David Alan Gilbert  */
3528632e3a5cSDr. David Alan Gilbert static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3529329c9b10SDr. David Alan Gilbert {
3530329c9b10SDr. David Alan Gilbert     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3531329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_REGISTER_RESULT,
3532329c9b10SDr. David Alan Gilbert                                .repeat = 0,
3533329c9b10SDr. David Alan Gilbert                              };
3534329c9b10SDr. David Alan Gilbert     RDMAControlHeader unreg_resp = { .len = 0,
3535329c9b10SDr. David Alan Gilbert                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3536329c9b10SDr. David Alan Gilbert                                .repeat = 0,
3537329c9b10SDr. David Alan Gilbert                              };
3538329c9b10SDr. David Alan Gilbert     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3539329c9b10SDr. David Alan Gilbert                                  .repeat = 1 };
35406ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
354174637e6fSLidong Chen     RDMAContext *rdma;
354274637e6fSLidong Chen     RDMALocalBlocks *local;
3543329c9b10SDr. David Alan Gilbert     RDMAControlHeader head;
3544329c9b10SDr. David Alan Gilbert     RDMARegister *reg, *registers;
3545329c9b10SDr. David Alan Gilbert     RDMACompress *comp;
3546329c9b10SDr. David Alan Gilbert     RDMARegisterResult *reg_result;
3547329c9b10SDr. David Alan Gilbert     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3548329c9b10SDr. David Alan Gilbert     RDMALocalBlock *block;
3549329c9b10SDr. David Alan Gilbert     void *host_addr;
3550329c9b10SDr. David Alan Gilbert     int ret = 0;
3551329c9b10SDr. David Alan Gilbert     int idx = 0;
3552329c9b10SDr. David Alan Gilbert     int count = 0;
3553329c9b10SDr. David Alan Gilbert     int i = 0;
3554329c9b10SDr. David Alan Gilbert 
3555987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3556d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmain);
355774637e6fSLidong Chen 
355874637e6fSLidong Chen     if (!rdma) {
355974637e6fSLidong Chen         return -EIO;
356074637e6fSLidong Chen     }
356174637e6fSLidong Chen 
3562329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
3563329c9b10SDr. David Alan Gilbert 
356474637e6fSLidong Chen     local = &rdma->local_ram_blocks;
3565329c9b10SDr. David Alan Gilbert     do {
3566632e3a5cSDr. David Alan Gilbert         trace_qemu_rdma_registration_handle_wait();
3567329c9b10SDr. David Alan Gilbert 
3568329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3569329c9b10SDr. David Alan Gilbert 
3570329c9b10SDr. David Alan Gilbert         if (ret < 0) {
3571329c9b10SDr. David Alan Gilbert             break;
3572329c9b10SDr. David Alan Gilbert         }
3573329c9b10SDr. David Alan Gilbert 
3574329c9b10SDr. David Alan Gilbert         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3575733252deSDr. David Alan Gilbert             error_report("rdma: Too many requests in this message (%d)."
3576733252deSDr. David Alan Gilbert                             "Bailing.", head.repeat);
3577329c9b10SDr. David Alan Gilbert             ret = -EIO;
3578329c9b10SDr. David Alan Gilbert             break;
3579329c9b10SDr. David Alan Gilbert         }
3580329c9b10SDr. David Alan Gilbert 
3581329c9b10SDr. David Alan Gilbert         switch (head.type) {
3582329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_COMPRESS:
3583329c9b10SDr. David Alan Gilbert             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3584329c9b10SDr. David Alan Gilbert             network_to_compress(comp);
3585329c9b10SDr. David Alan Gilbert 
3586733252deSDr. David Alan Gilbert             trace_qemu_rdma_registration_handle_compress(comp->length,
3587733252deSDr. David Alan Gilbert                                                          comp->block_idx,
3588733252deSDr. David Alan Gilbert                                                          comp->offset);
3589afcddefdSDr. David Alan Gilbert             if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3590afcddefdSDr. David Alan Gilbert                 error_report("rdma: 'compress' bad block index %u (vs %d)",
3591afcddefdSDr. David Alan Gilbert                              (unsigned int)comp->block_idx,
3592afcddefdSDr. David Alan Gilbert                              rdma->local_ram_blocks.nb_blocks);
3593afcddefdSDr. David Alan Gilbert                 ret = -EIO;
359424b41d66SDr. David Alan Gilbert                 goto out;
3595afcddefdSDr. David Alan Gilbert             }
3596329c9b10SDr. David Alan Gilbert             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3597329c9b10SDr. David Alan Gilbert 
3598329c9b10SDr. David Alan Gilbert             host_addr = block->local_host_addr +
3599329c9b10SDr. David Alan Gilbert                             (comp->offset - block->offset);
3600329c9b10SDr. David Alan Gilbert 
3601329c9b10SDr. David Alan Gilbert             ram_handle_compressed(host_addr, comp->value, comp->length);
3602329c9b10SDr. David Alan Gilbert             break;
3603329c9b10SDr. David Alan Gilbert 
3604329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_FINISHED:
3605733252deSDr. David Alan Gilbert             trace_qemu_rdma_registration_handle_finished();
3606329c9b10SDr. David Alan Gilbert             goto out;
3607329c9b10SDr. David Alan Gilbert 
3608329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3609733252deSDr. David Alan Gilbert             trace_qemu_rdma_registration_handle_ram_blocks();
3610329c9b10SDr. David Alan Gilbert 
3611e4d63320SDr. David Alan Gilbert             /* Sort our local RAM Block list so it's the same as the source,
3612e4d63320SDr. David Alan Gilbert              * we can do this since we've filled in a src_index in the list
3613e4d63320SDr. David Alan Gilbert              * as we received the RAMBlock list earlier.
3614e4d63320SDr. David Alan Gilbert              */
3615e4d63320SDr. David Alan Gilbert             qsort(rdma->local_ram_blocks.block,
3616e4d63320SDr. David Alan Gilbert                   rdma->local_ram_blocks.nb_blocks,
3617e4d63320SDr. David Alan Gilbert                   sizeof(RDMALocalBlock), dest_ram_sort_func);
361871cd7306SLidong Chen             for (i = 0; i < local->nb_blocks; i++) {
361971cd7306SLidong Chen                 local->block[i].index = i;
362071cd7306SLidong Chen             }
362171cd7306SLidong Chen 
3622329c9b10SDr. David Alan Gilbert             if (rdma->pin_all) {
3623329c9b10SDr. David Alan Gilbert                 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3624329c9b10SDr. David Alan Gilbert                 if (ret) {
3625733252deSDr. David Alan Gilbert                     error_report("rdma migration: error dest "
3626733252deSDr. David Alan Gilbert                                     "registering ram blocks");
3627329c9b10SDr. David Alan Gilbert                     goto out;
3628329c9b10SDr. David Alan Gilbert                 }
3629329c9b10SDr. David Alan Gilbert             }
3630329c9b10SDr. David Alan Gilbert 
3631329c9b10SDr. David Alan Gilbert             /*
3632329c9b10SDr. David Alan Gilbert              * Dest uses this to prepare to transmit the RAMBlock descriptions
3633329c9b10SDr. David Alan Gilbert              * to the source VM after connection setup.
3634329c9b10SDr. David Alan Gilbert              * Both sides use the "remote" structure to communicate and update
3635329c9b10SDr. David Alan Gilbert              * their "local" descriptions with what was sent.
3636329c9b10SDr. David Alan Gilbert              */
3637329c9b10SDr. David Alan Gilbert             for (i = 0; i < local->nb_blocks; i++) {
3638a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].remote_host_addr =
3639fbce8c25SStefan Weil                     (uintptr_t)(local->block[i].local_host_addr);
3640329c9b10SDr. David Alan Gilbert 
3641329c9b10SDr. David Alan Gilbert                 if (rdma->pin_all) {
3642a97270adSDr. David Alan Gilbert                     rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3643329c9b10SDr. David Alan Gilbert                 }
3644329c9b10SDr. David Alan Gilbert 
3645a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].offset = local->block[i].offset;
3646a97270adSDr. David Alan Gilbert                 rdma->dest_blocks[i].length = local->block[i].length;
3647329c9b10SDr. David Alan Gilbert 
3648a97270adSDr. David Alan Gilbert                 dest_block_to_network(&rdma->dest_blocks[i]);
3649e4d63320SDr. David Alan Gilbert                 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3650e4d63320SDr. David Alan Gilbert                     local->block[i].block_name,
3651e4d63320SDr. David Alan Gilbert                     local->block[i].offset,
3652e4d63320SDr. David Alan Gilbert                     local->block[i].length,
3653e4d63320SDr. David Alan Gilbert                     local->block[i].local_host_addr,
3654e4d63320SDr. David Alan Gilbert                     local->block[i].src_index);
3655329c9b10SDr. David Alan Gilbert             }
3656329c9b10SDr. David Alan Gilbert 
3657329c9b10SDr. David Alan Gilbert             blocks.len = rdma->local_ram_blocks.nb_blocks
3658a97270adSDr. David Alan Gilbert                                                 * sizeof(RDMADestBlock);
3659329c9b10SDr. David Alan Gilbert 
3660329c9b10SDr. David Alan Gilbert 
3661329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_post_send_control(rdma,
3662a97270adSDr. David Alan Gilbert                                         (uint8_t *) rdma->dest_blocks, &blocks);
3663329c9b10SDr. David Alan Gilbert 
3664329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3665733252deSDr. David Alan Gilbert                 error_report("rdma migration: error sending remote info");
3666329c9b10SDr. David Alan Gilbert                 goto out;
3667329c9b10SDr. David Alan Gilbert             }
3668329c9b10SDr. David Alan Gilbert 
3669329c9b10SDr. David Alan Gilbert             break;
3670329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_REQUEST:
3671733252deSDr. David Alan Gilbert             trace_qemu_rdma_registration_handle_register(head.repeat);
3672329c9b10SDr. David Alan Gilbert 
3673329c9b10SDr. David Alan Gilbert             reg_resp.repeat = head.repeat;
3674329c9b10SDr. David Alan Gilbert             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3675329c9b10SDr. David Alan Gilbert 
3676329c9b10SDr. David Alan Gilbert             for (count = 0; count < head.repeat; count++) {
3677329c9b10SDr. David Alan Gilbert                 uint64_t chunk;
3678329c9b10SDr. David Alan Gilbert                 uint8_t *chunk_start, *chunk_end;
3679329c9b10SDr. David Alan Gilbert 
3680329c9b10SDr. David Alan Gilbert                 reg = &registers[count];
3681329c9b10SDr. David Alan Gilbert                 network_to_register(reg);
3682329c9b10SDr. David Alan Gilbert 
3683329c9b10SDr. David Alan Gilbert                 reg_result = &results[count];
3684329c9b10SDr. David Alan Gilbert 
3685733252deSDr. David Alan Gilbert                 trace_qemu_rdma_registration_handle_register_loop(count,
3686329c9b10SDr. David Alan Gilbert                          reg->current_index, reg->key.current_addr, reg->chunks);
3687329c9b10SDr. David Alan Gilbert 
3688afcddefdSDr. David Alan Gilbert                 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3689afcddefdSDr. David Alan Gilbert                     error_report("rdma: 'register' bad block index %u (vs %d)",
3690afcddefdSDr. David Alan Gilbert                                  (unsigned int)reg->current_index,
3691afcddefdSDr. David Alan Gilbert                                  rdma->local_ram_blocks.nb_blocks);
3692afcddefdSDr. David Alan Gilbert                     ret = -ENOENT;
369324b41d66SDr. David Alan Gilbert                     goto out;
3694afcddefdSDr. David Alan Gilbert                 }
3695329c9b10SDr. David Alan Gilbert                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3696329c9b10SDr. David Alan Gilbert                 if (block->is_ram_block) {
3697afcddefdSDr. David Alan Gilbert                     if (block->offset > reg->key.current_addr) {
3698afcddefdSDr. David Alan Gilbert                         error_report("rdma: bad register address for block %s"
3699afcddefdSDr. David Alan Gilbert                             " offset: %" PRIx64 " current_addr: %" PRIx64,
3700afcddefdSDr. David Alan Gilbert                             block->block_name, block->offset,
3701afcddefdSDr. David Alan Gilbert                             reg->key.current_addr);
3702afcddefdSDr. David Alan Gilbert                         ret = -ERANGE;
370324b41d66SDr. David Alan Gilbert                         goto out;
3704afcddefdSDr. David Alan Gilbert                     }
3705329c9b10SDr. David Alan Gilbert                     host_addr = (block->local_host_addr +
3706329c9b10SDr. David Alan Gilbert                                 (reg->key.current_addr - block->offset));
3707329c9b10SDr. David Alan Gilbert                     chunk = ram_chunk_index(block->local_host_addr,
3708329c9b10SDr. David Alan Gilbert                                             (uint8_t *) host_addr);
3709329c9b10SDr. David Alan Gilbert                 } else {
3710329c9b10SDr. David Alan Gilbert                     chunk = reg->key.chunk;
3711329c9b10SDr. David Alan Gilbert                     host_addr = block->local_host_addr +
3712329c9b10SDr. David Alan Gilbert                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3713afcddefdSDr. David Alan Gilbert                     /* Check for particularly bad chunk value */
3714afcddefdSDr. David Alan Gilbert                     if (host_addr < (void *)block->local_host_addr) {
3715afcddefdSDr. David Alan Gilbert                         error_report("rdma: bad chunk for block %s"
3716afcddefdSDr. David Alan Gilbert                             " chunk: %" PRIx64,
3717afcddefdSDr. David Alan Gilbert                             block->block_name, reg->key.chunk);
3718afcddefdSDr. David Alan Gilbert                         ret = -ERANGE;
371924b41d66SDr. David Alan Gilbert                         goto out;
3720afcddefdSDr. David Alan Gilbert                     }
3721329c9b10SDr. David Alan Gilbert                 }
3722329c9b10SDr. David Alan Gilbert                 chunk_start = ram_chunk_start(block, chunk);
3723329c9b10SDr. David Alan Gilbert                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
37249589e763SMarcel Apfelbaum                 /* avoid "-Waddress-of-packed-member" warning */
37259589e763SMarcel Apfelbaum                 uint32_t tmp_rkey = 0;
3726329c9b10SDr. David Alan Gilbert                 if (qemu_rdma_register_and_get_keys(rdma, block,
37279589e763SMarcel Apfelbaum                             (uintptr_t)host_addr, NULL, &tmp_rkey,
3728329c9b10SDr. David Alan Gilbert                             chunk, chunk_start, chunk_end)) {
3729733252deSDr. David Alan Gilbert                     error_report("cannot get rkey");
3730329c9b10SDr. David Alan Gilbert                     ret = -EINVAL;
3731329c9b10SDr. David Alan Gilbert                     goto out;
3732329c9b10SDr. David Alan Gilbert                 }
37339589e763SMarcel Apfelbaum                 reg_result->rkey = tmp_rkey;
3734329c9b10SDr. David Alan Gilbert 
3735fbce8c25SStefan Weil                 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3736329c9b10SDr. David Alan Gilbert 
3737733252deSDr. David Alan Gilbert                 trace_qemu_rdma_registration_handle_register_rkey(
3738329c9b10SDr. David Alan Gilbert                                                            reg_result->rkey);
3739329c9b10SDr. David Alan Gilbert 
3740329c9b10SDr. David Alan Gilbert                 result_to_network(reg_result);
3741329c9b10SDr. David Alan Gilbert             }
3742329c9b10SDr. David Alan Gilbert 
3743329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_post_send_control(rdma,
3744329c9b10SDr. David Alan Gilbert                             (uint8_t *) results, &reg_resp);
3745329c9b10SDr. David Alan Gilbert 
3746329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3747733252deSDr. David Alan Gilbert                 error_report("Failed to send control buffer");
3748329c9b10SDr. David Alan Gilbert                 goto out;
3749329c9b10SDr. David Alan Gilbert             }
3750329c9b10SDr. David Alan Gilbert             break;
3751329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_UNREGISTER_REQUEST:
3752733252deSDr. David Alan Gilbert             trace_qemu_rdma_registration_handle_unregister(head.repeat);
3753329c9b10SDr. David Alan Gilbert             unreg_resp.repeat = head.repeat;
3754329c9b10SDr. David Alan Gilbert             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3755329c9b10SDr. David Alan Gilbert 
3756329c9b10SDr. David Alan Gilbert             for (count = 0; count < head.repeat; count++) {
3757329c9b10SDr. David Alan Gilbert                 reg = &registers[count];
3758329c9b10SDr. David Alan Gilbert                 network_to_register(reg);
3759329c9b10SDr. David Alan Gilbert 
3760733252deSDr. David Alan Gilbert                 trace_qemu_rdma_registration_handle_unregister_loop(count,
3761733252deSDr. David Alan Gilbert                            reg->current_index, reg->key.chunk);
3762329c9b10SDr. David Alan Gilbert 
3763329c9b10SDr. David Alan Gilbert                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3764329c9b10SDr. David Alan Gilbert 
3765329c9b10SDr. David Alan Gilbert                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3766329c9b10SDr. David Alan Gilbert                 block->pmr[reg->key.chunk] = NULL;
3767329c9b10SDr. David Alan Gilbert 
3768329c9b10SDr. David Alan Gilbert                 if (ret != 0) {
3769329c9b10SDr. David Alan Gilbert                     perror("rdma unregistration chunk failed");
3770329c9b10SDr. David Alan Gilbert                     ret = -ret;
3771329c9b10SDr. David Alan Gilbert                     goto out;
3772329c9b10SDr. David Alan Gilbert                 }
3773329c9b10SDr. David Alan Gilbert 
3774329c9b10SDr. David Alan Gilbert                 rdma->total_registrations--;
3775329c9b10SDr. David Alan Gilbert 
3776733252deSDr. David Alan Gilbert                 trace_qemu_rdma_registration_handle_unregister_success(
3777329c9b10SDr. David Alan Gilbert                                                        reg->key.chunk);
3778329c9b10SDr. David Alan Gilbert             }
3779329c9b10SDr. David Alan Gilbert 
3780329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3781329c9b10SDr. David Alan Gilbert 
3782329c9b10SDr. David Alan Gilbert             if (ret < 0) {
3783733252deSDr. David Alan Gilbert                 error_report("Failed to send control buffer");
3784329c9b10SDr. David Alan Gilbert                 goto out;
3785329c9b10SDr. David Alan Gilbert             }
3786329c9b10SDr. David Alan Gilbert             break;
3787329c9b10SDr. David Alan Gilbert         case RDMA_CONTROL_REGISTER_RESULT:
3788733252deSDr. David Alan Gilbert             error_report("Invalid RESULT message at dest.");
3789329c9b10SDr. David Alan Gilbert             ret = -EIO;
3790329c9b10SDr. David Alan Gilbert             goto out;
3791329c9b10SDr. David Alan Gilbert         default:
3792482a33c5SDr. David Alan Gilbert             error_report("Unknown control message %s", control_desc(head.type));
3793329c9b10SDr. David Alan Gilbert             ret = -EIO;
3794329c9b10SDr. David Alan Gilbert             goto out;
3795329c9b10SDr. David Alan Gilbert         }
3796329c9b10SDr. David Alan Gilbert     } while (1);
3797329c9b10SDr. David Alan Gilbert out:
3798329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3799329c9b10SDr. David Alan Gilbert         rdma->error_state = ret;
3800329c9b10SDr. David Alan Gilbert     }
3801329c9b10SDr. David Alan Gilbert     return ret;
3802329c9b10SDr. David Alan Gilbert }
3803329c9b10SDr. David Alan Gilbert 
3804e4d63320SDr. David Alan Gilbert /* Destination:
3805e4d63320SDr. David Alan Gilbert  * Called via a ram_control_load_hook during the initial RAM load section which
3806e4d63320SDr. David Alan Gilbert  * lists the RAMBlocks by name.  This lets us know the order of the RAMBlocks
3807e4d63320SDr. David Alan Gilbert  * on the source.
3808e4d63320SDr. David Alan Gilbert  * We've already built our local RAMBlock list, but not yet sent the list to
3809e4d63320SDr. David Alan Gilbert  * the source.
3810e4d63320SDr. David Alan Gilbert  */
38116ddd2d76SDaniel P. Berrange static int
38126ddd2d76SDaniel P. Berrange rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3813e4d63320SDr. David Alan Gilbert {
381474637e6fSLidong Chen     RDMAContext *rdma;
3815e4d63320SDr. David Alan Gilbert     int curr;
3816e4d63320SDr. David Alan Gilbert     int found = -1;
3817e4d63320SDr. David Alan Gilbert 
3818987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3819d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmain);
382074637e6fSLidong Chen 
382174637e6fSLidong Chen     if (!rdma) {
382274637e6fSLidong Chen         return -EIO;
382374637e6fSLidong Chen     }
382474637e6fSLidong Chen 
3825e4d63320SDr. David Alan Gilbert     /* Find the matching RAMBlock in our local list */
3826e4d63320SDr. David Alan Gilbert     for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3827e4d63320SDr. David Alan Gilbert         if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3828e4d63320SDr. David Alan Gilbert             found = curr;
3829e4d63320SDr. David Alan Gilbert             break;
3830e4d63320SDr. David Alan Gilbert         }
3831e4d63320SDr. David Alan Gilbert     }
3832e4d63320SDr. David Alan Gilbert 
3833e4d63320SDr. David Alan Gilbert     if (found == -1) {
3834e4d63320SDr. David Alan Gilbert         error_report("RAMBlock '%s' not found on destination", name);
3835e4d63320SDr. David Alan Gilbert         return -ENOENT;
3836e4d63320SDr. David Alan Gilbert     }
3837e4d63320SDr. David Alan Gilbert 
3838e4d63320SDr. David Alan Gilbert     rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3839e4d63320SDr. David Alan Gilbert     trace_rdma_block_notification_handle(name, rdma->next_src_index);
3840e4d63320SDr. David Alan Gilbert     rdma->next_src_index++;
3841e4d63320SDr. David Alan Gilbert 
3842e4d63320SDr. David Alan Gilbert     return 0;
3843e4d63320SDr. David Alan Gilbert }
3844e4d63320SDr. David Alan Gilbert 
3845365c0463SDaniel P. Berrangé static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
3846632e3a5cSDr. David Alan Gilbert {
3847365c0463SDaniel P. Berrangé     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3848632e3a5cSDr. David Alan Gilbert     switch (flags) {
3849632e3a5cSDr. David Alan Gilbert     case RAM_CONTROL_BLOCK_REG:
3850365c0463SDaniel P. Berrangé         return rdma_block_notification_handle(rioc, data);
3851632e3a5cSDr. David Alan Gilbert 
3852632e3a5cSDr. David Alan Gilbert     case RAM_CONTROL_HOOK:
3853365c0463SDaniel P. Berrangé         return qemu_rdma_registration_handle(f, rioc);
3854632e3a5cSDr. David Alan Gilbert 
3855632e3a5cSDr. David Alan Gilbert     default:
3856632e3a5cSDr. David Alan Gilbert         /* Shouldn't be called with any other values */
3857632e3a5cSDr. David Alan Gilbert         abort();
3858632e3a5cSDr. David Alan Gilbert     }
3859632e3a5cSDr. David Alan Gilbert }
3860632e3a5cSDr. David Alan Gilbert 
3861365c0463SDaniel P. Berrangé static int qemu_rdma_registration_start(QEMUFile *f,
3862632e3a5cSDr. David Alan Gilbert                                         uint64_t flags, void *data)
3863329c9b10SDr. David Alan Gilbert {
3864365c0463SDaniel P. Berrangé     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
386574637e6fSLidong Chen     RDMAContext *rdma;
386674637e6fSLidong Chen 
3867987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3868d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
386974637e6fSLidong Chen     if (!rdma) {
387074637e6fSLidong Chen         return -EIO;
387174637e6fSLidong Chen     }
3872329c9b10SDr. David Alan Gilbert 
3873329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
3874329c9b10SDr. David Alan Gilbert 
38756a88eb2bSWei Yang     if (migration_in_postcopy()) {
3876ccb7e1b5SLidong Chen         return 0;
3877ccb7e1b5SLidong Chen     }
3878ccb7e1b5SLidong Chen 
3879733252deSDr. David Alan Gilbert     trace_qemu_rdma_registration_start(flags);
3880329c9b10SDr. David Alan Gilbert     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3881329c9b10SDr. David Alan Gilbert     qemu_fflush(f);
3882329c9b10SDr. David Alan Gilbert 
3883329c9b10SDr. David Alan Gilbert     return 0;
3884329c9b10SDr. David Alan Gilbert }
3885329c9b10SDr. David Alan Gilbert 
3886329c9b10SDr. David Alan Gilbert /*
3887329c9b10SDr. David Alan Gilbert  * Inform dest that dynamic registrations are done for now.
3888329c9b10SDr. David Alan Gilbert  * First, flush writes, if any.
3889329c9b10SDr. David Alan Gilbert  */
3890365c0463SDaniel P. Berrangé static int qemu_rdma_registration_stop(QEMUFile *f,
3891632e3a5cSDr. David Alan Gilbert                                        uint64_t flags, void *data)
3892329c9b10SDr. David Alan Gilbert {
3893365c0463SDaniel P. Berrangé     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
389474637e6fSLidong Chen     RDMAContext *rdma;
3895329c9b10SDr. David Alan Gilbert     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3896329c9b10SDr. David Alan Gilbert     int ret = 0;
3897329c9b10SDr. David Alan Gilbert 
3898987ab2a5SDr. David Alan Gilbert     RCU_READ_LOCK_GUARD();
3899d73415a3SStefan Hajnoczi     rdma = qatomic_rcu_read(&rioc->rdmaout);
390074637e6fSLidong Chen     if (!rdma) {
390174637e6fSLidong Chen         return -EIO;
390274637e6fSLidong Chen     }
390374637e6fSLidong Chen 
3904329c9b10SDr. David Alan Gilbert     CHECK_ERROR_STATE();
3905329c9b10SDr. David Alan Gilbert 
39066a88eb2bSWei Yang     if (migration_in_postcopy()) {
3907ccb7e1b5SLidong Chen         return 0;
3908ccb7e1b5SLidong Chen     }
3909ccb7e1b5SLidong Chen 
3910329c9b10SDr. David Alan Gilbert     qemu_fflush(f);
3911329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_drain_cq(f, rdma);
3912329c9b10SDr. David Alan Gilbert 
3913329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3914329c9b10SDr. David Alan Gilbert         goto err;
3915329c9b10SDr. David Alan Gilbert     }
3916329c9b10SDr. David Alan Gilbert 
3917329c9b10SDr. David Alan Gilbert     if (flags == RAM_CONTROL_SETUP) {
3918329c9b10SDr. David Alan Gilbert         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3919329c9b10SDr. David Alan Gilbert         RDMALocalBlocks *local = &rdma->local_ram_blocks;
3920e4d63320SDr. David Alan Gilbert         int reg_result_idx, i, nb_dest_blocks;
3921329c9b10SDr. David Alan Gilbert 
3922329c9b10SDr. David Alan Gilbert         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3923733252deSDr. David Alan Gilbert         trace_qemu_rdma_registration_stop_ram();
3924329c9b10SDr. David Alan Gilbert 
3925329c9b10SDr. David Alan Gilbert         /*
3926329c9b10SDr. David Alan Gilbert          * Make sure that we parallelize the pinning on both sides.
3927329c9b10SDr. David Alan Gilbert          * For very large guests, doing this serially takes a really
3928329c9b10SDr. David Alan Gilbert          * long time, so we have to 'interleave' the pinning locally
3929329c9b10SDr. David Alan Gilbert          * with the control messages by performing the pinning on this
3930329c9b10SDr. David Alan Gilbert          * side before we receive the control response from the other
3931329c9b10SDr. David Alan Gilbert          * side that the pinning has completed.
3932329c9b10SDr. David Alan Gilbert          */
3933329c9b10SDr. David Alan Gilbert         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3934329c9b10SDr. David Alan Gilbert                     &reg_result_idx, rdma->pin_all ?
3935329c9b10SDr. David Alan Gilbert                     qemu_rdma_reg_whole_ram_blocks : NULL);
3936329c9b10SDr. David Alan Gilbert         if (ret < 0) {
39379cde9caaSMarkus Armbruster             fprintf(stderr, "receiving remote info!");
3938329c9b10SDr. David Alan Gilbert             return ret;
3939329c9b10SDr. David Alan Gilbert         }
3940329c9b10SDr. David Alan Gilbert 
3941a97270adSDr. David Alan Gilbert         nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3942329c9b10SDr. David Alan Gilbert 
3943329c9b10SDr. David Alan Gilbert         /*
3944329c9b10SDr. David Alan Gilbert          * The protocol uses two different sets of rkeys (mutually exclusive):
3945329c9b10SDr. David Alan Gilbert          * 1. One key to represent the virtual address of the entire ram block.
3946329c9b10SDr. David Alan Gilbert          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3947329c9b10SDr. David Alan Gilbert          * 2. One to represent individual chunks within a ram block.
3948329c9b10SDr. David Alan Gilbert          *    (dynamic chunk registration enabled - pin individual chunks.)
3949329c9b10SDr. David Alan Gilbert          *
3950329c9b10SDr. David Alan Gilbert          * Once the capability is successfully negotiated, the destination transmits
3951329c9b10SDr. David Alan Gilbert          * the keys to use (or sends them later) including the virtual addresses
3952329c9b10SDr. David Alan Gilbert          * and then propagates the remote ram block descriptions to his local copy.
3953329c9b10SDr. David Alan Gilbert          */
3954329c9b10SDr. David Alan Gilbert 
3955a97270adSDr. David Alan Gilbert         if (local->nb_blocks != nb_dest_blocks) {
39569cde9caaSMarkus Armbruster             fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
3957329c9b10SDr. David Alan Gilbert                     "Your QEMU command line parameters are probably "
3958e4d63320SDr. David Alan Gilbert                     "not identical on both the source and destination.",
3959e4d63320SDr. David Alan Gilbert                     local->nb_blocks, nb_dest_blocks);
3960ef4b722dSDr. David Alan Gilbert             rdma->error_state = -EINVAL;
3961329c9b10SDr. David Alan Gilbert             return -EINVAL;
3962329c9b10SDr. David Alan Gilbert         }
3963329c9b10SDr. David Alan Gilbert 
3964329c9b10SDr. David Alan Gilbert         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3965a97270adSDr. David Alan Gilbert         memcpy(rdma->dest_blocks,
3966329c9b10SDr. David Alan Gilbert             rdma->wr_data[reg_result_idx].control_curr, resp.len);
3967a97270adSDr. David Alan Gilbert         for (i = 0; i < nb_dest_blocks; i++) {
3968a97270adSDr. David Alan Gilbert             network_to_dest_block(&rdma->dest_blocks[i]);
3969329c9b10SDr. David Alan Gilbert 
3970e4d63320SDr. David Alan Gilbert             /* We require that the blocks are in the same order */
3971e4d63320SDr. David Alan Gilbert             if (rdma->dest_blocks[i].length != local->block[i].length) {
39729cde9caaSMarkus Armbruster                 fprintf(stderr, "Block %s/%d has a different length %" PRIu64
3973e4d63320SDr. David Alan Gilbert                         "vs %" PRIu64, local->block[i].block_name, i,
3974e4d63320SDr. David Alan Gilbert                         local->block[i].length,
3975e4d63320SDr. David Alan Gilbert                         rdma->dest_blocks[i].length);
3976ef4b722dSDr. David Alan Gilbert                 rdma->error_state = -EINVAL;
3977329c9b10SDr. David Alan Gilbert                 return -EINVAL;
3978329c9b10SDr. David Alan Gilbert             }
3979e4d63320SDr. David Alan Gilbert             local->block[i].remote_host_addr =
3980a97270adSDr. David Alan Gilbert                     rdma->dest_blocks[i].remote_host_addr;
3981e4d63320SDr. David Alan Gilbert             local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3982329c9b10SDr. David Alan Gilbert         }
3983329c9b10SDr. David Alan Gilbert     }
3984329c9b10SDr. David Alan Gilbert 
3985733252deSDr. David Alan Gilbert     trace_qemu_rdma_registration_stop(flags);
3986329c9b10SDr. David Alan Gilbert 
3987329c9b10SDr. David Alan Gilbert     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3988329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3989329c9b10SDr. David Alan Gilbert 
3990329c9b10SDr. David Alan Gilbert     if (ret < 0) {
3991329c9b10SDr. David Alan Gilbert         goto err;
3992329c9b10SDr. David Alan Gilbert     }
3993329c9b10SDr. David Alan Gilbert 
3994329c9b10SDr. David Alan Gilbert     return 0;
3995329c9b10SDr. David Alan Gilbert err:
3996329c9b10SDr. David Alan Gilbert     rdma->error_state = ret;
3997329c9b10SDr. David Alan Gilbert     return ret;
3998329c9b10SDr. David Alan Gilbert }
3999329c9b10SDr. David Alan Gilbert 
40000436e09fSDaniel P. Berrange static const QEMUFileHooks rdma_read_hooks = {
4001632e3a5cSDr. David Alan Gilbert     .hook_ram_load = rdma_load_hook,
4002329c9b10SDr. David Alan Gilbert };
4003329c9b10SDr. David Alan Gilbert 
40040436e09fSDaniel P. Berrange static const QEMUFileHooks rdma_write_hooks = {
4005329c9b10SDr. David Alan Gilbert     .before_ram_iterate = qemu_rdma_registration_start,
4006329c9b10SDr. David Alan Gilbert     .after_ram_iterate  = qemu_rdma_registration_stop,
4007329c9b10SDr. David Alan Gilbert     .save_page          = qemu_rdma_save_page,
4008329c9b10SDr. David Alan Gilbert };
4009329c9b10SDr. David Alan Gilbert 
40106ddd2d76SDaniel P. Berrange 
40116ddd2d76SDaniel P. Berrange static void qio_channel_rdma_finalize(Object *obj)
4012329c9b10SDr. David Alan Gilbert {
40136ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
401474637e6fSLidong Chen     if (rioc->rdmain) {
401574637e6fSLidong Chen         qemu_rdma_cleanup(rioc->rdmain);
401674637e6fSLidong Chen         g_free(rioc->rdmain);
401774637e6fSLidong Chen         rioc->rdmain = NULL;
401874637e6fSLidong Chen     }
401974637e6fSLidong Chen     if (rioc->rdmaout) {
402074637e6fSLidong Chen         qemu_rdma_cleanup(rioc->rdmaout);
402174637e6fSLidong Chen         g_free(rioc->rdmaout);
402274637e6fSLidong Chen         rioc->rdmaout = NULL;
40236ddd2d76SDaniel P. Berrange     }
40246ddd2d76SDaniel P. Berrange }
40256ddd2d76SDaniel P. Berrange 
40266ddd2d76SDaniel P. Berrange static void qio_channel_rdma_class_init(ObjectClass *klass,
40276ddd2d76SDaniel P. Berrange                                         void *class_data G_GNUC_UNUSED)
40286ddd2d76SDaniel P. Berrange {
40296ddd2d76SDaniel P. Berrange     QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
40306ddd2d76SDaniel P. Berrange 
40316ddd2d76SDaniel P. Berrange     ioc_klass->io_writev = qio_channel_rdma_writev;
40326ddd2d76SDaniel P. Berrange     ioc_klass->io_readv = qio_channel_rdma_readv;
40336ddd2d76SDaniel P. Berrange     ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
40346ddd2d76SDaniel P. Berrange     ioc_klass->io_close = qio_channel_rdma_close;
40356ddd2d76SDaniel P. Berrange     ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
40364d9f675bSLidong Chen     ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
403754db882fSLidong Chen     ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
40386ddd2d76SDaniel P. Berrange }
40396ddd2d76SDaniel P. Berrange 
40406ddd2d76SDaniel P. Berrange static const TypeInfo qio_channel_rdma_info = {
40416ddd2d76SDaniel P. Berrange     .parent = TYPE_QIO_CHANNEL,
40426ddd2d76SDaniel P. Berrange     .name = TYPE_QIO_CHANNEL_RDMA,
40436ddd2d76SDaniel P. Berrange     .instance_size = sizeof(QIOChannelRDMA),
40446ddd2d76SDaniel P. Berrange     .instance_finalize = qio_channel_rdma_finalize,
40456ddd2d76SDaniel P. Berrange     .class_init = qio_channel_rdma_class_init,
40466ddd2d76SDaniel P. Berrange };
40476ddd2d76SDaniel P. Berrange 
40486ddd2d76SDaniel P. Berrange static void qio_channel_rdma_register_types(void)
40496ddd2d76SDaniel P. Berrange {
40506ddd2d76SDaniel P. Berrange     type_register_static(&qio_channel_rdma_info);
40516ddd2d76SDaniel P. Berrange }
40526ddd2d76SDaniel P. Berrange 
40536ddd2d76SDaniel P. Berrange type_init(qio_channel_rdma_register_types);
40546ddd2d76SDaniel P. Berrange 
40556ddd2d76SDaniel P. Berrange static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
40566ddd2d76SDaniel P. Berrange {
40576ddd2d76SDaniel P. Berrange     QIOChannelRDMA *rioc;
4058329c9b10SDr. David Alan Gilbert 
4059329c9b10SDr. David Alan Gilbert     if (qemu_file_mode_is_not_valid(mode)) {
4060329c9b10SDr. David Alan Gilbert         return NULL;
4061329c9b10SDr. David Alan Gilbert     }
4062329c9b10SDr. David Alan Gilbert 
40636ddd2d76SDaniel P. Berrange     rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4064329c9b10SDr. David Alan Gilbert 
4065329c9b10SDr. David Alan Gilbert     if (mode[0] == 'w') {
406677ef2dc1SDaniel P. Berrangé         rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
406774637e6fSLidong Chen         rioc->rdmaout = rdma;
406874637e6fSLidong Chen         rioc->rdmain = rdma->return_path;
40696ddd2d76SDaniel P. Berrange         qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
4070329c9b10SDr. David Alan Gilbert     } else {
407177ef2dc1SDaniel P. Berrangé         rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
407274637e6fSLidong Chen         rioc->rdmain = rdma;
407374637e6fSLidong Chen         rioc->rdmaout = rdma->return_path;
40746ddd2d76SDaniel P. Berrange         qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
4075329c9b10SDr. David Alan Gilbert     }
4076329c9b10SDr. David Alan Gilbert 
40776ddd2d76SDaniel P. Berrange     return rioc->file;
4078329c9b10SDr. David Alan Gilbert }
4079329c9b10SDr. David Alan Gilbert 
4080329c9b10SDr. David Alan Gilbert static void rdma_accept_incoming_migration(void *opaque)
4081329c9b10SDr. David Alan Gilbert {
4082329c9b10SDr. David Alan Gilbert     RDMAContext *rdma = opaque;
4083329c9b10SDr. David Alan Gilbert     int ret;
4084329c9b10SDr. David Alan Gilbert     QEMUFile *f;
40852a1bc8bdSDr. David Alan Gilbert     Error *local_err = NULL;
4086329c9b10SDr. David Alan Gilbert 
408724ec68efSDr. David Alan Gilbert     trace_qemu_rdma_accept_incoming_migration();
4088329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_accept(rdma);
4089329c9b10SDr. David Alan Gilbert 
4090329c9b10SDr. David Alan Gilbert     if (ret) {
40912a1bc8bdSDr. David Alan Gilbert         fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
4092329c9b10SDr. David Alan Gilbert         return;
4093329c9b10SDr. David Alan Gilbert     }
4094329c9b10SDr. David Alan Gilbert 
409524ec68efSDr. David Alan Gilbert     trace_qemu_rdma_accept_incoming_migration_accepted();
4096329c9b10SDr. David Alan Gilbert 
409755cc1b59SLidong Chen     if (rdma->is_return_path) {
409855cc1b59SLidong Chen         return;
409955cc1b59SLidong Chen     }
410055cc1b59SLidong Chen 
4101329c9b10SDr. David Alan Gilbert     f = qemu_fopen_rdma(rdma, "rb");
4102329c9b10SDr. David Alan Gilbert     if (f == NULL) {
41032a1bc8bdSDr. David Alan Gilbert         fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
4104329c9b10SDr. David Alan Gilbert         qemu_rdma_cleanup(rdma);
4105329c9b10SDr. David Alan Gilbert         return;
4106329c9b10SDr. David Alan Gilbert     }
4107329c9b10SDr. David Alan Gilbert 
4108329c9b10SDr. David Alan Gilbert     rdma->migration_started_on_destination = 1;
41092a1bc8bdSDr. David Alan Gilbert     migration_fd_process_incoming(f, &local_err);
41102a1bc8bdSDr. David Alan Gilbert     if (local_err) {
41112a1bc8bdSDr. David Alan Gilbert         error_reportf_err(local_err, "RDMA ERROR:");
41122a1bc8bdSDr. David Alan Gilbert     }
4113329c9b10SDr. David Alan Gilbert }
4114329c9b10SDr. David Alan Gilbert 
4115329c9b10SDr. David Alan Gilbert void rdma_start_incoming_migration(const char *host_port, Error **errp)
4116329c9b10SDr. David Alan Gilbert {
4117329c9b10SDr. David Alan Gilbert     int ret;
4118bf027419SLi Zhijian     RDMAContext *rdma;
4119329c9b10SDr. David Alan Gilbert     Error *local_err = NULL;
4120329c9b10SDr. David Alan Gilbert 
4121733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration();
4122329c9b10SDr. David Alan Gilbert 
41235f1f1902SDavid Hildenbrand     /* Avoid ram_block_discard_disable(), cannot change during migration. */
41245f1f1902SDavid Hildenbrand     if (ram_block_discard_is_required()) {
41255f1f1902SDavid Hildenbrand         error_setg(errp, "RDMA: cannot disable RAM discard");
41265f1f1902SDavid Hildenbrand         return;
41275f1f1902SDavid Hildenbrand     }
41285f1f1902SDavid Hildenbrand 
41295f1f1902SDavid Hildenbrand     rdma = qemu_rdma_data_init(host_port, &local_err);
4130329c9b10SDr. David Alan Gilbert     if (rdma == NULL) {
4131329c9b10SDr. David Alan Gilbert         goto err;
4132329c9b10SDr. David Alan Gilbert     }
4133329c9b10SDr. David Alan Gilbert 
4134329c9b10SDr. David Alan Gilbert     ret = qemu_rdma_dest_init(rdma, &local_err);
4135329c9b10SDr. David Alan Gilbert 
4136329c9b10SDr. David Alan Gilbert     if (ret) {
4137329c9b10SDr. David Alan Gilbert         goto err;
4138329c9b10SDr. David Alan Gilbert     }
4139329c9b10SDr. David Alan Gilbert 
4140733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration_after_dest_init();
4141329c9b10SDr. David Alan Gilbert 
4142329c9b10SDr. David Alan Gilbert     ret = rdma_listen(rdma->listen_id, 5);
4143329c9b10SDr. David Alan Gilbert 
4144329c9b10SDr. David Alan Gilbert     if (ret) {
4145329c9b10SDr. David Alan Gilbert         ERROR(errp, "listening on socket!");
41464e812d23SLi Zhijian         goto cleanup_rdma;
4147329c9b10SDr. David Alan Gilbert     }
4148329c9b10SDr. David Alan Gilbert 
4149733252deSDr. David Alan Gilbert     trace_rdma_start_incoming_migration_after_rdma_listen();
4150329c9b10SDr. David Alan Gilbert 
415182e1cc4bSFam Zheng     qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
415282e1cc4bSFam Zheng                         NULL, (void *)(intptr_t)rdma);
4153329c9b10SDr. David Alan Gilbert     return;
41544e812d23SLi Zhijian 
41554e812d23SLi Zhijian cleanup_rdma:
41564e812d23SLi Zhijian     qemu_rdma_cleanup(rdma);
4157329c9b10SDr. David Alan Gilbert err:
4158329c9b10SDr. David Alan Gilbert     error_propagate(errp, local_err);
41593b59ee72SPan Nengyuan     if (rdma) {
416059c59c67SPan Nengyuan         g_free(rdma->host);
416144bcfd45SLi Zhijian         g_free(rdma->host_port);
41623b59ee72SPan Nengyuan     }
4163329c9b10SDr. David Alan Gilbert     g_free(rdma);
4164329c9b10SDr. David Alan Gilbert }
4165329c9b10SDr. David Alan Gilbert 
4166329c9b10SDr. David Alan Gilbert void rdma_start_outgoing_migration(void *opaque,
4167329c9b10SDr. David Alan Gilbert                             const char *host_port, Error **errp)
4168329c9b10SDr. David Alan Gilbert {
4169329c9b10SDr. David Alan Gilbert     MigrationState *s = opaque;
417055cc1b59SLidong Chen     RDMAContext *rdma_return_path = NULL;
41715f1f1902SDavid Hildenbrand     RDMAContext *rdma;
4172329c9b10SDr. David Alan Gilbert     int ret = 0;
4173329c9b10SDr. David Alan Gilbert 
41745f1f1902SDavid Hildenbrand     /* Avoid ram_block_discard_disable(), cannot change during migration. */
41755f1f1902SDavid Hildenbrand     if (ram_block_discard_is_required()) {
41765f1f1902SDavid Hildenbrand         error_setg(errp, "RDMA: cannot disable RAM discard");
41775f1f1902SDavid Hildenbrand         return;
41785f1f1902SDavid Hildenbrand     }
41795f1f1902SDavid Hildenbrand 
41805f1f1902SDavid Hildenbrand     rdma = qemu_rdma_data_init(host_port, errp);
4181329c9b10SDr. David Alan Gilbert     if (rdma == NULL) {
4182329c9b10SDr. David Alan Gilbert         goto err;
4183329c9b10SDr. David Alan Gilbert     }
4184329c9b10SDr. David Alan Gilbert 
418517cba690SJuan Quintela     ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
4186329c9b10SDr. David Alan Gilbert 
4187329c9b10SDr. David Alan Gilbert     if (ret) {
4188329c9b10SDr. David Alan Gilbert         goto err;
4189329c9b10SDr. David Alan Gilbert     }
4190329c9b10SDr. David Alan Gilbert 
4191733252deSDr. David Alan Gilbert     trace_rdma_start_outgoing_migration_after_rdma_source_init();
4192e49e49ddSLi Zhijian     ret = qemu_rdma_connect(rdma, errp, false);
4193329c9b10SDr. David Alan Gilbert 
4194329c9b10SDr. David Alan Gilbert     if (ret) {
4195329c9b10SDr. David Alan Gilbert         goto err;
4196329c9b10SDr. David Alan Gilbert     }
4197329c9b10SDr. David Alan Gilbert 
41983a4452d8Szhaolichang     /* RDMA postcopy need a separate queue pair for return path */
419938ad1110SJuan Quintela     if (migrate_postcopy() || migrate_return_path()) {
420055cc1b59SLidong Chen         rdma_return_path = qemu_rdma_data_init(host_port, errp);
420155cc1b59SLidong Chen 
420255cc1b59SLidong Chen         if (rdma_return_path == NULL) {
42032f0c285aSPan Nengyuan             goto return_path_err;
420455cc1b59SLidong Chen         }
420555cc1b59SLidong Chen 
420655cc1b59SLidong Chen         ret = qemu_rdma_source_init(rdma_return_path,
420717cba690SJuan Quintela                                     migrate_rdma_pin_all(), errp);
420855cc1b59SLidong Chen 
420955cc1b59SLidong Chen         if (ret) {
42102f0c285aSPan Nengyuan             goto return_path_err;
421155cc1b59SLidong Chen         }
421255cc1b59SLidong Chen 
4213e49e49ddSLi Zhijian         ret = qemu_rdma_connect(rdma_return_path, errp, true);
421455cc1b59SLidong Chen 
421555cc1b59SLidong Chen         if (ret) {
42162f0c285aSPan Nengyuan             goto return_path_err;
421755cc1b59SLidong Chen         }
421855cc1b59SLidong Chen 
421955cc1b59SLidong Chen         rdma->return_path = rdma_return_path;
422055cc1b59SLidong Chen         rdma_return_path->return_path = rdma;
422155cc1b59SLidong Chen         rdma_return_path->is_return_path = true;
422255cc1b59SLidong Chen     }
422355cc1b59SLidong Chen 
4224733252deSDr. David Alan Gilbert     trace_rdma_start_outgoing_migration_after_rdma_connect();
4225329c9b10SDr. David Alan Gilbert 
422689a02a9fSzhanghailiang     s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4227cce8040bSDr. David Alan Gilbert     migrate_fd_connect(s, NULL);
4228329c9b10SDr. David Alan Gilbert     return;
42292f0c285aSPan Nengyuan return_path_err:
42302f0c285aSPan Nengyuan     qemu_rdma_cleanup(rdma);
4231329c9b10SDr. David Alan Gilbert err:
4232329c9b10SDr. David Alan Gilbert     g_free(rdma);
423355cc1b59SLidong Chen     g_free(rdma_return_path);
4234329c9b10SDr. David Alan Gilbert }
4235