/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 * Copyright Red Hat, Inc. 2015-2016
 *
 * Authors:
 *  Michael R. Hines <mrhines@us.ibm.com>
 *  Jiuxing Liu <jl@us.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "exec/target_page.h"
#include "rdma.h"
#include "migration.h"
#include "migration-stats.h"
#include "qemu-file.h"
#include "ram.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/rcu.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
#include "exec/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>
#include "trace.h"
#include "qom/object.h"
#include "options.h"
#include <poll.h>

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
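
/*
 * For reference, the values derived from the constants above (informational
 * only; nothing below depends on these spelled-out numbers): a merge limit
 * of 2 MiB divided by 4096 gives RDMA_SIGNALED_SEND_MAX == 512 outstanding
 * signaled sends, and a chunk shift of 20 gives a 1 MiB registration chunk,
 * so a maximally merged write touches at most two or three chunks depending
 * on its alignment.
 */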

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)

/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};
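
/*
 * Worked example of the work request ID layout described above (purely
 * illustrative; this is not a helper used elsewhere in this file): an RDMA
 * write of chunk 5 in ram block 2 would be tagged with
 *
 *     wr_id = RDMA_WRID_RDMA_WRITE
 *             | (2UL << RDMA_WRID_BLOCK_SHIFT)
 *             | (5UL << RDMA_WRID_CHUNK_SHIFT);
 *
 * and on completion the pieces are recovered with
 *
 *     type  =  wr_id & RDMA_WRID_TYPE_MASK;
 *     block = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
 *     chunk = (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
 */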

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * SEND/RECV IB Control Messages.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};


/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}

/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 */
typedef struct RDMALocalBlock {
    char          *block_name;
    uint8_t       *local_host_addr;    /* local virtual address */
    uint64_t       remote_host_addr;   /* remote virtual address */
    uint64_t       offset;
    uint64_t       length;
    struct ibv_mr **pmr;               /* MRs for chunk-level registration */
    struct ibv_mr *mr;                 /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;        /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;        /* rkeys for non-chunk-level registration */
    int            index;              /* which block are we */
    unsigned int   src_index;          /* (Only used on dest) */
    bool           is_ram_block;
    int            nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;

/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;

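/*
 * Informational note: because RDMADestBlock is QEMU_PACKED, each entry is
 * exactly 8 + 8 + 8 + 4 + 4 = 32 bytes on the wire; the trailing 'padding'
 * field rounds the structure up to a multiple of 8 bytes.
 */
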
static const char *control_desc(unsigned int rdma_control)
{
    static const char *strs[] = {
        [RDMA_CONTROL_NONE] = "NONE",
        [RDMA_CONTROL_ERROR] = "ERROR",
        [RDMA_CONTROL_READY] = "READY",
        [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
        [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
        [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
        [RDMA_CONTROL_COMPRESS] = "COMPRESS",
        [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
        [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
        [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
        [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
        [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
    };

    if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
        return "??BAD CONTROL VALUE??";
    }

    return strs[rdma_control];
}

#if !defined(htonll)
static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}
#endif

#if !defined(ntohll)
static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
#endif
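
/*
 * Quick sanity example for the helpers above (informational): on a
 * little-endian host, htonll(0x0102030405060708ULL) produces a value whose
 * in-memory bytes are 01 02 03 04 05 06 07 08 (network byte order), and
 * ntohll() applied to that value restores the original host-order integer.
 */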

static void dest_block_to_network(RDMADestBlock *db)
{
    db->remote_host_addr = htonll(db->remote_host_addr);
    db->offset = htonll(db->offset);
    db->length = htonll(db->length);
    db->remote_rkey = htonl(db->remote_rkey);
}

static void network_to_dest_block(RDMADestBlock *db)
{
    db->remote_host_addr = ntohll(db->remote_host_addr);
    db->offset = ntohll(db->offset);
    db->length = ntohll(db->length);
    db->remote_rkey = ntohl(db->remote_rkey);
}

/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;                  /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether the
     * initial "READY" message has already been received.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
    struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *recv_cq;                 /* receive completion queue */
    struct ibv_cq *send_cq;                 /* send completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    bool errored;
    bool error_reported;
    bool received_error;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int    next_src_index;

    /*
     * Migration on the *destination* has started; in that case we use the
     * coroutine yield function.
     * The source runs in a thread, so we don't care there.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)


struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};

/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}

/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;        /* how many sequential chunks to register */
} RDMARegister;

static bool rdma_errored(RDMAContext *rdma)
{
    if (rdma->errored && !rdma->error_reported) {
        error_report("RDMA is in an error state waiting migration"
                     " to abort!");
        rdma->error_reported = true;
    }
    return rdma->errored;
}

static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space, we need to translate this for the destination
         */
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;

static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    /*
     * comp->offset as passed in is an address in the local ram_addr_t
     * space, we need to translate this for the destination
     */
    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}

/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
};

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
};

static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma,
                                                   Error **errp),
                                   Error **errp);

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
                                  (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
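
/*
 * Worked example for the chunk helpers above (informational): with
 * RDMA_REG_CHUNK_SHIFT == 20 a chunk covers 1 MiB, so for a 16 MiB block
 * ram_chunk_index() maps a host pointer 3.5 MiB into the block to chunk 3,
 * ram_chunk_start() of chunk 3 is local_host_addr + 3 MiB, and
 * ram_chunk_end() of the final chunk is clamped to the end of the block
 * when the block length is not a multiple of the chunk size.
 */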

static void rdma_add_block(RDMAContext *rdma, const char *block_name,
                           void *host_addr,
                           ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        if (rdma->blockmap) {
            for (int x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U; /* Filled in by the receipt of the block list */
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;
}

/*
 * Memory regions need to be registered with the device and queue pairs set up
 * in advance, before the migration starts. This tells us where the RAM blocks
 * are so that we can register them individually.
 */
static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t block_offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    rdma_add_block(opaque, block_name, host_addr, block_offset, length);
    return 0;
}

/*
 * Identify the RAMBlocks and their quantity. They will be used to
 * identify chunk boundaries inside each RAMBlock and also be referenced
 * during dynamic page registration.
 */
static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int ret;

    assert(rdma->blockmap == NULL);
    memset(local, 0, sizeof *local);
    ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
    assert(!ret);
    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
    rdma->dest_blocks = g_new0(RDMADestBlock,
                               rdma->local_ram_blocks.nb_blocks);
    local->init = true;
}

/*
 * Note: If used outside of cleanup, the caller must ensure that the destination
 * block structures are also updated
 */
static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        for (int j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    if (rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (int x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks && rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }
}

/*
 * Trace RDMA device open, with device details.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        trace_qemu_rdma_dump_id_failed(who);
        return;
    }

    trace_qemu_rdma_dump_id(who,
                            verbs->device->name,
                            verbs->device->dev_name,
                            verbs->device->dev_path,
                            verbs->device->ibdev_path,
                            port.link_layer,
                            port.link_layer == IBV_LINK_LAYER_INFINIBAND ? "Infiniband"
                            : port.link_layer == IBV_LINK_LAYER_ETHERNET ? "Ethernet"
                            : "Unknown");
}

/*
 * Trace RDMA gid addressing information.
 * Useful for understanding the RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    trace_qemu_rdma_dump_gid(who, sgid, dgid);
}

/*
 * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 * We will try the next addrinfo struct, and fail if there are
 * no other valid addresses to bind against.
 *
 * If the user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 *
 * In this case, the source VM will throw an error for ALL types of
 * connections (both IPv4 and IPv6) if the destination machine does not have
 * a regular infiniband network available for use.
 *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 *
 * Unfortunately, this puts the user in a fix:
 *
 * If the source VM connects with an IPv4 address without knowing that the
 * destination has bound to '[::]' the migration will unconditionally fail
 * unless the management software is explicitly listening on the IPv4
 * address while using a RoCE-based device.
 *
 * If the source VM connects with an IPv6 address, then we're OK because we can
 * throw an error on the source (and similarly on the destination).
 *
 * But in mixed environments, this will be broken for a while until it is fixed
 * inside linux.
 *
 * We do provide a *tiny* bit of help in this function: We can list all of the
 * devices in the system and check to see if all the devices are RoCE or
 * Infiniband.
 *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 *
 * Patches are being reviewed on linux-rdma.
 */
static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
{
    /* This bug only exists in linux, to our knowledge. */
#ifdef CONFIG_LINUX
    struct ibv_port_attr port_attr;

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     *
     * Let's iterate through all the devices and see if there are any pure IB
     * devices (non-ethernet).
     *
     * If not, then we can safely proceed with the migration.
     * Otherwise, there are no guarantees until the bug is fixed in linux.
     */
    if (!verbs) {
        int num_devices;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (int x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);
            /*
             * ibv_open_device() is not documented to set errno.  If
             * it does, it's somebody else's doc bug.  If it doesn't,
             * the use of errno below is wrong.
             * TODO Find out whether ibv_open_device() sets errno.
             */
            if (!verbs) {
                if (errno == EPERM) {
                    continue;
                } else {
                    error_setg_errno(errp, errno,
                                     "could not open RDMA device context");
                    return -1;
                }
            }

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                error_setg(errp,
                           "RDMA ERROR: Could not query initial IB port");
                return -1;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);

        }

        if (roce_found) {
            if (ib_found) {
                warn_report("migrations may fail:"
                            " IPv6 over RoCE / iWARP in linux"
                            " is broken. But since you appear to have a"
                            " mixed RoCE / IB environment, be sure to only"
                            " migrate over the IB fabric until the kernel "
                            " fixes the bug.");
            } else {
                error_setg(errp, "RDMA ERROR: "
                           "You only have RoCE / iWARP devices in your systems"
                           " and your management software has specified '[::]'"
                           ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -1;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, that means that something other than '[::]'
     * was used by the management software for binding, in which case we can
     * actually warn the user about a potentially broken kernel.
893329c9b10SDr. David Alan Gilbert */
894329c9b10SDr. David Alan Gilbert
895329c9b10SDr. David Alan Gilbert /* IB ports start with 1, not 0 */
896329c9b10SDr. David Alan Gilbert if (ibv_query_port(verbs, 1, &port_attr)) {
8978fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: Could not query initial IB port");
8988c6513f7SMarkus Armbruster return -1;
899329c9b10SDr. David Alan Gilbert }
900329c9b10SDr. David Alan Gilbert
901329c9b10SDr. David Alan Gilbert if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
9028fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: "
9038fd471bdSMarkus Armbruster "Linux kernel's RoCE / iWARP does not support IPv6 "
904329c9b10SDr. David Alan Gilbert "(but patches on linux-rdma in progress)");
9058c6513f7SMarkus Armbruster return -1;
906329c9b10SDr. David Alan Gilbert }
907329c9b10SDr. David Alan Gilbert
908329c9b10SDr. David Alan Gilbert #endif
909329c9b10SDr. David Alan Gilbert
910329c9b10SDr. David Alan Gilbert return 0;
911329c9b10SDr. David Alan Gilbert }
912329c9b10SDr. David Alan Gilbert
913329c9b10SDr. David Alan Gilbert /*
914329c9b10SDr. David Alan Gilbert * Figure out which RDMA device corresponds to the requested IP hostname
915329c9b10SDr. David Alan Gilbert * Also create the initial connection manager identifiers for opening
916329c9b10SDr. David Alan Gilbert * the connection.
917329c9b10SDr. David Alan Gilbert */
918329c9b10SDr. David Alan Gilbert static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
919329c9b10SDr. David Alan Gilbert {
920071d5ae4SMarkus Armbruster Error *err = NULL;
921329c9b10SDr. David Alan Gilbert int ret;
922329c9b10SDr. David Alan Gilbert struct rdma_addrinfo *res;
923329c9b10SDr. David Alan Gilbert char port_str[16];
924329c9b10SDr. David Alan Gilbert struct rdma_cm_event *cm_event;
925329c9b10SDr. David Alan Gilbert char ip[40] = "unknown";
926329c9b10SDr. David Alan Gilbert
927329c9b10SDr. David Alan Gilbert if (rdma->host == NULL || !strcmp(rdma->host, "")) {
9288fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
92907249822SMarkus Armbruster return -1;
930329c9b10SDr. David Alan Gilbert }
931329c9b10SDr. David Alan Gilbert
932329c9b10SDr. David Alan Gilbert /* create CM channel */
933329c9b10SDr. David Alan Gilbert rdma->channel = rdma_create_event_channel();
934329c9b10SDr. David Alan Gilbert if (!rdma->channel) {
9358fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not create CM channel");
93607249822SMarkus Armbruster return -1;
937329c9b10SDr. David Alan Gilbert }
938329c9b10SDr. David Alan Gilbert
939329c9b10SDr. David Alan Gilbert /* create CM id */
940329c9b10SDr. David Alan Gilbert ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
9414a102179SMarkus Armbruster if (ret < 0) {
9428fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not create channel id");
943329c9b10SDr. David Alan Gilbert goto err_resolve_create_id;
944329c9b10SDr. David Alan Gilbert }
945329c9b10SDr. David Alan Gilbert
946329c9b10SDr. David Alan Gilbert snprintf(port_str, 16, "%d", rdma->port);
947329c9b10SDr. David Alan Gilbert port_str[15] = '\0';
948329c9b10SDr. David Alan Gilbert
949329c9b10SDr. David Alan Gilbert ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
95007249822SMarkus Armbruster if (ret) {
9518fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
9528fd471bdSMarkus Armbruster rdma->host);
953329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
954329c9b10SDr. David Alan Gilbert }
955329c9b10SDr. David Alan Gilbert
956071d5ae4SMarkus Armbruster /* Try all addresses, saving the first error in @err */
95714e2fcbbSJuan Quintela for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
958071d5ae4SMarkus Armbruster Error **local_errp = err ? NULL : &err;
959071d5ae4SMarkus Armbruster
960329c9b10SDr. David Alan Gilbert inet_ntop(e->ai_family,
961329c9b10SDr. David Alan Gilbert &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
962733252deSDr. David Alan Gilbert trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
963329c9b10SDr. David Alan Gilbert
964329c9b10SDr. David Alan Gilbert ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
965329c9b10SDr. David Alan Gilbert RDMA_RESOLVE_TIMEOUT_MS);
9664a102179SMarkus Armbruster if (ret >= 0) {
967329c9b10SDr. David Alan Gilbert if (e->ai_family == AF_INET6) {
968071d5ae4SMarkus Armbruster ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
969071d5ae4SMarkus Armbruster local_errp);
9704a102179SMarkus Armbruster if (ret < 0) {
971329c9b10SDr. David Alan Gilbert continue;
972329c9b10SDr. David Alan Gilbert }
973329c9b10SDr. David Alan Gilbert }
974071d5ae4SMarkus Armbruster error_free(err);
975329c9b10SDr. David Alan Gilbert goto route;
976329c9b10SDr. David Alan Gilbert }
977329c9b10SDr. David Alan Gilbert }
978329c9b10SDr. David Alan Gilbert
979f53b450aSLi Zhijian rdma_freeaddrinfo(res);
980071d5ae4SMarkus Armbruster if (err) {
981071d5ae4SMarkus Armbruster error_propagate(errp, err);
982071d5ae4SMarkus Armbruster } else {
9838fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not resolve address %s",
9848fd471bdSMarkus Armbruster rdma->host);
9858fd471bdSMarkus Armbruster }
986329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
987329c9b10SDr. David Alan Gilbert
988329c9b10SDr. David Alan Gilbert route:
989f53b450aSLi Zhijian rdma_freeaddrinfo(res);
990329c9b10SDr. David Alan Gilbert qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
991329c9b10SDr. David Alan Gilbert
992329c9b10SDr. David Alan Gilbert ret = rdma_get_cm_event(rdma->channel, &cm_event);
9934a102179SMarkus Armbruster if (ret < 0) {
9948fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
995329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
996329c9b10SDr. David Alan Gilbert }
997329c9b10SDr. David Alan Gilbert
998329c9b10SDr. David Alan Gilbert if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
9998fd471bdSMarkus Armbruster error_setg(errp,
10008fd471bdSMarkus Armbruster "RDMA ERROR: result not equal to event_addr_resolved %s",
1001329c9b10SDr. David Alan Gilbert rdma_event_str(cm_event->event));
1002329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
1003329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
1004329c9b10SDr. David Alan Gilbert }
1005329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
1006329c9b10SDr. David Alan Gilbert
1007329c9b10SDr. David Alan Gilbert /* resolve route */
1008329c9b10SDr. David Alan Gilbert ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
10094a102179SMarkus Armbruster if (ret < 0) {
10108fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not resolve rdma route");
1011329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
1012329c9b10SDr. David Alan Gilbert }
1013329c9b10SDr. David Alan Gilbert
1014329c9b10SDr. David Alan Gilbert ret = rdma_get_cm_event(rdma->channel, &cm_event);
10154a102179SMarkus Armbruster if (ret < 0) {
10168fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
1017329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
1018329c9b10SDr. David Alan Gilbert }
1019329c9b10SDr. David Alan Gilbert if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
10208fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: "
10218fd471bdSMarkus Armbruster "result not equal to event_route_resolved: %s",
1022329c9b10SDr. David Alan Gilbert rdma_event_str(cm_event->event));
1023329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
1024329c9b10SDr. David Alan Gilbert goto err_resolve_get_addr;
1025329c9b10SDr. David Alan Gilbert }
1026329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
1027329c9b10SDr. David Alan Gilbert rdma->verbs = rdma->cm_id->verbs;
1028329c9b10SDr. David Alan Gilbert qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1029329c9b10SDr. David Alan Gilbert qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1030329c9b10SDr. David Alan Gilbert return 0;
1031329c9b10SDr. David Alan Gilbert
1032329c9b10SDr. David Alan Gilbert err_resolve_get_addr:
1033329c9b10SDr. David Alan Gilbert rdma_destroy_id(rdma->cm_id);
1034329c9b10SDr. David Alan Gilbert rdma->cm_id = NULL;
1035329c9b10SDr. David Alan Gilbert err_resolve_create_id:
1036329c9b10SDr. David Alan Gilbert rdma_destroy_event_channel(rdma->channel);
1037329c9b10SDr. David Alan Gilbert rdma->channel = NULL;
103807249822SMarkus Armbruster return -1;
1039329c9b10SDr. David Alan Gilbert }
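
/*
 * The function above follows the standard librdmacm client-side resolution
 * sequence.  A minimal sketch of that sequence in isolation (illustrative
 * only: error cleanup is trimmed and example_resolve() is a hypothetical
 * helper, not something defined in this file) would look like:
 *
 *   static int example_resolve(const char *host, const char *port)
 *   {
 *       struct rdma_event_channel *ch = rdma_create_event_channel();
 *       struct rdma_cm_id *id;
 *       struct rdma_addrinfo *res;
 *       struct rdma_cm_event *ev;
 *
 *       if (!ch || rdma_create_id(ch, &id, NULL, RDMA_PS_TCP) < 0) {
 *           return -1;
 *       }
 *       if (rdma_getaddrinfo(host, port, NULL, &res)) {
 *           return -1;
 *       }
 *       if (rdma_resolve_addr(id, NULL, res->ai_dst_addr,
 *                             RDMA_RESOLVE_TIMEOUT_MS) < 0 ||
 *           rdma_get_cm_event(ch, &ev) < 0 ||
 *           ev->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 *           return -1;
 *       }
 *       rdma_ack_cm_event(ev);
 *       if (rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT_MS) < 0 ||
 *           rdma_get_cm_event(ch, &ev) < 0 ||
 *           ev->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 *           return -1;
 *       }
 *       rdma_ack_cm_event(ev);
 *       rdma_freeaddrinfo(res);
 *       return 0;
 *   }
 *
 * After RDMA_CM_EVENT_ROUTE_RESOLVED, id->verbs is usable for ibv_*()
 * calls, which is exactly how rdma->verbs is obtained above.
 */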
1040329c9b10SDr. David Alan Gilbert
1041329c9b10SDr. David Alan Gilbert /*
1042329c9b10SDr. David Alan Gilbert * Create protection domain and completion queues
1043329c9b10SDr. David Alan Gilbert */
104407d5b946SMarkus Armbruster static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp)
1045329c9b10SDr. David Alan Gilbert {
1046329c9b10SDr. David Alan Gilbert /* allocate pd */
1047329c9b10SDr. David Alan Gilbert rdma->pd = ibv_alloc_pd(rdma->verbs);
1048329c9b10SDr. David Alan Gilbert if (!rdma->pd) {
104907d5b946SMarkus Armbruster error_setg(errp, "failed to allocate protection domain");
1050329c9b10SDr. David Alan Gilbert return -1;
1051329c9b10SDr. David Alan Gilbert }
1052329c9b10SDr. David Alan Gilbert
1053b390afd8SLi Zhijian /* create receive completion channel */
1054b390afd8SLi Zhijian rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1055b390afd8SLi Zhijian if (!rdma->recv_comp_channel) {
105607d5b946SMarkus Armbruster error_setg(errp, "failed to allocate receive completion channel");
1057329c9b10SDr. David Alan Gilbert goto err_alloc_pd_cq;
1058329c9b10SDr. David Alan Gilbert }
1059329c9b10SDr. David Alan Gilbert
1060329c9b10SDr. David Alan Gilbert /*
1061b390afd8SLi Zhijian * Completion queue can be filled by receive work requests.
1062329c9b10SDr. David Alan Gilbert */
1063b390afd8SLi Zhijian rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1064b390afd8SLi Zhijian NULL, rdma->recv_comp_channel, 0);
1065b390afd8SLi Zhijian if (!rdma->recv_cq) {
106607d5b946SMarkus Armbruster error_setg(errp, "failed to allocate receive completion queue");
1067b390afd8SLi Zhijian goto err_alloc_pd_cq;
1068b390afd8SLi Zhijian }
1069b390afd8SLi Zhijian
1070b390afd8SLi Zhijian /* create send completion channel */
1071b390afd8SLi Zhijian rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1072b390afd8SLi Zhijian if (!rdma->send_comp_channel) {
107307d5b946SMarkus Armbruster error_setg(errp, "failed to allocate send completion channel");
1074b390afd8SLi Zhijian goto err_alloc_pd_cq;
1075b390afd8SLi Zhijian }
1076b390afd8SLi Zhijian
1077b390afd8SLi Zhijian rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1078b390afd8SLi Zhijian NULL, rdma->send_comp_channel, 0);
1079b390afd8SLi Zhijian if (!rdma->send_cq) {
108007d5b946SMarkus Armbruster error_setg(errp, "failed to allocate send completion queue");
1081329c9b10SDr. David Alan Gilbert goto err_alloc_pd_cq;
1082329c9b10SDr. David Alan Gilbert }
1083329c9b10SDr. David Alan Gilbert
1084329c9b10SDr. David Alan Gilbert return 0;
1085329c9b10SDr. David Alan Gilbert
1086329c9b10SDr. David Alan Gilbert err_alloc_pd_cq:
1087329c9b10SDr. David Alan Gilbert if (rdma->pd) {
1088329c9b10SDr. David Alan Gilbert ibv_dealloc_pd(rdma->pd);
1089329c9b10SDr. David Alan Gilbert }
1090b390afd8SLi Zhijian if (rdma->recv_comp_channel) {
1091b390afd8SLi Zhijian ibv_destroy_comp_channel(rdma->recv_comp_channel);
1092b390afd8SLi Zhijian }
1093b390afd8SLi Zhijian if (rdma->send_comp_channel) {
1094b390afd8SLi Zhijian ibv_destroy_comp_channel(rdma->send_comp_channel);
1095b390afd8SLi Zhijian }
1096b390afd8SLi Zhijian if (rdma->recv_cq) {
1097b390afd8SLi Zhijian ibv_destroy_cq(rdma->recv_cq);
1098b390afd8SLi Zhijian rdma->recv_cq = NULL;
1099329c9b10SDr. David Alan Gilbert }
1100329c9b10SDr. David Alan Gilbert rdma->pd = NULL;
1101b390afd8SLi Zhijian rdma->recv_comp_channel = NULL;
1102b390afd8SLi Zhijian rdma->send_comp_channel = NULL;
1103329c9b10SDr. David Alan Gilbert return -1;
1104329c9b10SDr. David Alan Gilbert
1105329c9b10SDr. David Alan Gilbert }
1106329c9b10SDr. David Alan Gilbert
1107329c9b10SDr. David Alan Gilbert /*
1108329c9b10SDr. David Alan Gilbert * Create queue pairs.
1109329c9b10SDr. David Alan Gilbert */
1110329c9b10SDr. David Alan Gilbert static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1111329c9b10SDr. David Alan Gilbert {
1112329c9b10SDr. David Alan Gilbert struct ibv_qp_init_attr attr = { 0 };
1113329c9b10SDr. David Alan Gilbert
1114329c9b10SDr. David Alan Gilbert attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1115329c9b10SDr. David Alan Gilbert attr.cap.max_recv_wr = 3;
1116329c9b10SDr. David Alan Gilbert attr.cap.max_send_sge = 1;
1117329c9b10SDr. David Alan Gilbert attr.cap.max_recv_sge = 1;
1118b390afd8SLi Zhijian attr.send_cq = rdma->send_cq;
1119b390afd8SLi Zhijian attr.recv_cq = rdma->recv_cq;
1120329c9b10SDr. David Alan Gilbert attr.qp_type = IBV_QPT_RC;
1121329c9b10SDr. David Alan Gilbert
11228f5a7faaSJuan Quintela if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) {
1123329c9b10SDr. David Alan Gilbert return -1;
1124329c9b10SDr. David Alan Gilbert }
1125329c9b10SDr. David Alan Gilbert
1126329c9b10SDr. David Alan Gilbert rdma->qp = rdma->cm_id->qp;
1127329c9b10SDr. David Alan Gilbert return 0;
1128329c9b10SDr. David Alan Gilbert }
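
/*
 * Because the QP is created with rdma_create_qp() on a connection-manager
 * id, the later rdma_connect() / rdma_accept() calls drive the QP through
 * the INIT -> RTR -> RTS state transitions for us.  A hypothetical manual
 * setup (not used in this file) would have to create the QP with
 * ibv_create_qp() and then walk the states explicitly, roughly:
 *
 *   struct ibv_qp_attr qp_attr = { .qp_state = IBV_QPS_INIT };
 *   ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | ...);
 *   ... repeat for IBV_QPS_RTR and IBV_QPS_RTS ...
 *
 * which is exactly the boilerplate librdmacm hides.
 */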
1129329c9b10SDr. David Alan Gilbert
1130e2daccb0SLi Zhijian /* Check whether On-Demand Paging is supported by the RDMA device */
1131e2daccb0SLi Zhijian static bool rdma_support_odp(struct ibv_context *dev)
1132e2daccb0SLi Zhijian {
1133e2daccb0SLi Zhijian struct ibv_device_attr_ex attr = {0};
11348f5a7faaSJuan Quintela
11358f5a7faaSJuan Quintela if (ibv_query_device_ex(dev, NULL, &attr)) {
1136e2daccb0SLi Zhijian return false;
1137e2daccb0SLi Zhijian }
1138e2daccb0SLi Zhijian
1139e2daccb0SLi Zhijian if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1140e2daccb0SLi Zhijian return true;
1141e2daccb0SLi Zhijian }
1142e2daccb0SLi Zhijian
1143e2daccb0SLi Zhijian return false;
1144e2daccb0SLi Zhijian }
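
/*
 * rdma_support_odp() only checks the device-wide IBV_ODP_SUPPORT bit.  A
 * finer-grained check (a hypothetical variant, not used in this file)
 * could also verify that ODP covers the operations issued on an RC QP,
 * e.g. RDMA writes:
 *
 *   static bool example_rc_odp_write_ok(struct ibv_context *dev)
 *   {
 *       struct ibv_device_attr_ex attr = {0};
 *
 *       if (ibv_query_device_ex(dev, NULL, &attr)) {
 *           return false;
 *       }
 *       return (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) &&
 *              (attr.odp_caps.per_transport_caps.rc_odp_caps &
 *               IBV_ODP_SUPPORT_WRITE);
 *   }
 */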
1145e2daccb0SLi Zhijian
1146911965acSLi Zhijian /*
1147911965acSLi Zhijian * Use ibv_advise_mr() to avoid RNR NAK errors as far as possible.
1148911965acSLi Zhijian * A responder MR registered with ODP will send an RNR NAK back to
1149911965acSLi Zhijian * the requester when it takes a page fault.
1150911965acSLi Zhijian */
1151911965acSLi Zhijian static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
1152911965acSLi Zhijian uint32_t len, uint32_t lkey,
1153911965acSLi Zhijian const char *name, bool wr)
1154911965acSLi Zhijian {
1155911965acSLi Zhijian #ifdef HAVE_IBV_ADVISE_MR
1156911965acSLi Zhijian int ret;
1157911965acSLi Zhijian int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
1158911965acSLi Zhijian IBV_ADVISE_MR_ADVICE_PREFETCH;
1159911965acSLi Zhijian struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
1160911965acSLi Zhijian
1161911965acSLi Zhijian ret = ibv_advise_mr(pd, advice,
1162911965acSLi Zhijian IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
1163911965acSLi Zhijian /* ignore the error */
11640bc26045SMarkus Armbruster trace_qemu_rdma_advise_mr(name, len, addr, strerror(ret));
1165911965acSLi Zhijian #endif
1166911965acSLi Zhijian }
1167911965acSLi Zhijian
1168de1aa35fSMarkus Armbruster static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp)
1169329c9b10SDr. David Alan Gilbert {
1170329c9b10SDr. David Alan Gilbert int i;
1171329c9b10SDr. David Alan Gilbert RDMALocalBlocks *local = &rdma->local_ram_blocks;
1172329c9b10SDr. David Alan Gilbert
1173329c9b10SDr. David Alan Gilbert for (i = 0; i < local->nb_blocks; i++) {
1174e2daccb0SLi Zhijian int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1175e2daccb0SLi Zhijian
1176329c9b10SDr. David Alan Gilbert local->block[i].mr =
1177329c9b10SDr. David Alan Gilbert ibv_reg_mr(rdma->pd,
1178329c9b10SDr. David Alan Gilbert local->block[i].local_host_addr,
1179e2daccb0SLi Zhijian local->block[i].length, access
1180329c9b10SDr. David Alan Gilbert );
11810bc26045SMarkus Armbruster /*
11820bc26045SMarkus Armbruster * ibv_reg_mr() is not documented to set errno. If it does,
11830bc26045SMarkus Armbruster * it's somebody else's doc bug. If it doesn't, the use of
11840bc26045SMarkus Armbruster * errno below is wrong.
11850bc26045SMarkus Armbruster * TODO Find out whether ibv_reg_mr() sets errno.
11860bc26045SMarkus Armbruster */
1187e2daccb0SLi Zhijian if (!local->block[i].mr &&
1188e2daccb0SLi Zhijian errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1189e2daccb0SLi Zhijian access |= IBV_ACCESS_ON_DEMAND;
1190e2daccb0SLi Zhijian /* register ODP mr */
1191e2daccb0SLi Zhijian local->block[i].mr =
1192e2daccb0SLi Zhijian ibv_reg_mr(rdma->pd,
1193e2daccb0SLi Zhijian local->block[i].local_host_addr,
1194e2daccb0SLi Zhijian local->block[i].length, access);
1195e2daccb0SLi Zhijian trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1196911965acSLi Zhijian
1197911965acSLi Zhijian if (local->block[i].mr) {
1198911965acSLi Zhijian qemu_rdma_advise_prefetch_mr(rdma->pd,
1199911965acSLi Zhijian (uintptr_t)local->block[i].local_host_addr,
1200911965acSLi Zhijian local->block[i].length,
1201911965acSLi Zhijian local->block[i].mr->lkey,
1202911965acSLi Zhijian local->block[i].block_name,
1203911965acSLi Zhijian true);
1204911965acSLi Zhijian }
1205e2daccb0SLi Zhijian }
1206e2daccb0SLi Zhijian
1207329c9b10SDr. David Alan Gilbert if (!local->block[i].mr) {
1208de1aa35fSMarkus Armbruster error_setg_errno(errp, errno,
1209de1aa35fSMarkus Armbruster "Failed to register local dest ram block!");
1210de1aa35fSMarkus Armbruster goto err;
1211329c9b10SDr. David Alan Gilbert }
1212329c9b10SDr. David Alan Gilbert rdma->total_registrations++;
1213329c9b10SDr. David Alan Gilbert }
1214329c9b10SDr. David Alan Gilbert
1215329c9b10SDr. David Alan Gilbert return 0;
1216329c9b10SDr. David Alan Gilbert
1217de1aa35fSMarkus Armbruster err:
1218329c9b10SDr. David Alan Gilbert for (i--; i >= 0; i--) {
1219329c9b10SDr. David Alan Gilbert ibv_dereg_mr(local->block[i].mr);
1220224f364aSLi Zhijian local->block[i].mr = NULL;
1221329c9b10SDr. David Alan Gilbert rdma->total_registrations--;
1222329c9b10SDr. David Alan Gilbert }
1223329c9b10SDr. David Alan Gilbert
1224329c9b10SDr. David Alan Gilbert return -1;
1225329c9b10SDr. David Alan Gilbert
1226329c9b10SDr. David Alan Gilbert }
1227329c9b10SDr. David Alan Gilbert
1228329c9b10SDr. David Alan Gilbert /*
1229329c9b10SDr. David Alan Gilbert * Find the ram block that corresponds to the page requested to be
1230329c9b10SDr. David Alan Gilbert * transmitted by QEMU.
1231329c9b10SDr. David Alan Gilbert *
1232329c9b10SDr. David Alan Gilbert * Once the block is found, also identify which 'chunk' within that
1233329c9b10SDr. David Alan Gilbert * block that the page belongs to.
1234329c9b10SDr. David Alan Gilbert */
123587e6bdabSMarkus Armbruster static void qemu_rdma_search_ram_block(RDMAContext *rdma,
1236fbce8c25SStefan Weil uintptr_t block_offset,
1237329c9b10SDr. David Alan Gilbert uint64_t offset,
1238329c9b10SDr. David Alan Gilbert uint64_t length,
1239329c9b10SDr. David Alan Gilbert uint64_t *block_index,
1240329c9b10SDr. David Alan Gilbert uint64_t *chunk_index)
1241329c9b10SDr. David Alan Gilbert {
1242329c9b10SDr. David Alan Gilbert uint64_t current_addr = block_offset + offset;
1243329c9b10SDr. David Alan Gilbert RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1244329c9b10SDr. David Alan Gilbert (void *) block_offset);
1245329c9b10SDr. David Alan Gilbert assert(block);
1246329c9b10SDr. David Alan Gilbert assert(current_addr >= block->offset);
1247329c9b10SDr. David Alan Gilbert assert((current_addr + length) <= (block->offset + block->length));
1248329c9b10SDr. David Alan Gilbert
1249329c9b10SDr. David Alan Gilbert *block_index = block->index;
1250329c9b10SDr. David Alan Gilbert *chunk_index = ram_chunk_index(block->local_host_addr,
1251329c9b10SDr. David Alan Gilbert block->local_host_addr + (current_addr - block->offset));
1252329c9b10SDr. David Alan Gilbert }
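
/*
 * Worked example, using the 1 MB chunk size implied by
 * RDMA_REG_CHUNK_SHIFT (20): a page at offset 0x523000 into its RAM block
 * falls into chunk 0x523000 >> 20 = 5, i.e. the sixth 1 MB chunk of that
 * block.  ram_chunk_index() computes exactly this shift on the difference
 * between the page's host address and the block's base host address.
 */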
1253329c9b10SDr. David Alan Gilbert
1254329c9b10SDr. David Alan Gilbert /*
1255329c9b10SDr. David Alan Gilbert * Register a chunk with IB. If the chunk was already registered
1256329c9b10SDr. David Alan Gilbert * previously, then skip.
1257329c9b10SDr. David Alan Gilbert *
1258329c9b10SDr. David Alan Gilbert * Also return the keys associated with the registration needed
1259329c9b10SDr. David Alan Gilbert * to perform the actual RDMA operation.
1260329c9b10SDr. David Alan Gilbert */
1261329c9b10SDr. David Alan Gilbert static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
12623ac040c0SStefan Weil RDMALocalBlock *block, uintptr_t host_addr,
1263329c9b10SDr. David Alan Gilbert uint32_t *lkey, uint32_t *rkey, int chunk,
1264329c9b10SDr. David Alan Gilbert uint8_t *chunk_start, uint8_t *chunk_end)
1265329c9b10SDr. David Alan Gilbert {
1266329c9b10SDr. David Alan Gilbert if (block->mr) {
1267329c9b10SDr. David Alan Gilbert if (lkey) {
1268329c9b10SDr. David Alan Gilbert *lkey = block->mr->lkey;
1269329c9b10SDr. David Alan Gilbert }
1270329c9b10SDr. David Alan Gilbert if (rkey) {
1271329c9b10SDr. David Alan Gilbert *rkey = block->mr->rkey;
1272329c9b10SDr. David Alan Gilbert }
1273329c9b10SDr. David Alan Gilbert return 0;
1274329c9b10SDr. David Alan Gilbert }
1275329c9b10SDr. David Alan Gilbert
1276329c9b10SDr. David Alan Gilbert /* allocate memory to store chunk MRs */
1277329c9b10SDr. David Alan Gilbert if (!block->pmr) {
127897f3ad35SMarkus Armbruster block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1279329c9b10SDr. David Alan Gilbert }
1280329c9b10SDr. David Alan Gilbert
1281329c9b10SDr. David Alan Gilbert /*
1282329c9b10SDr. David Alan Gilbert * If 'rkey', then we're the destination, so grant access to the source.
1283329c9b10SDr. David Alan Gilbert *
1284329c9b10SDr. David Alan Gilbert * If 'lkey', then we're the source VM, so grant access only to ourselves.
1285329c9b10SDr. David Alan Gilbert */
1286329c9b10SDr. David Alan Gilbert if (!block->pmr[chunk]) {
1287329c9b10SDr. David Alan Gilbert uint64_t len = chunk_end - chunk_start;
1288e2daccb0SLi Zhijian int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1289e2daccb0SLi Zhijian 0;
1290329c9b10SDr. David Alan Gilbert
1291733252deSDr. David Alan Gilbert trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1292329c9b10SDr. David Alan Gilbert
1293e2daccb0SLi Zhijian block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
12940bc26045SMarkus Armbruster /*
12950bc26045SMarkus Armbruster * ibv_reg_mr() is not documented to set errno. If it does,
12960bc26045SMarkus Armbruster * it's somebody else's doc bug. If it doesn't, the use of
12970bc26045SMarkus Armbruster * errno below is wrong.
12980bc26045SMarkus Armbruster * TODO Find out whether ibv_reg_mr() sets errno.
12990bc26045SMarkus Armbruster */
1300e2daccb0SLi Zhijian if (!block->pmr[chunk] &&
1301e2daccb0SLi Zhijian errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1302e2daccb0SLi Zhijian access |= IBV_ACCESS_ON_DEMAND;
1303e2daccb0SLi Zhijian /* register ODP mr */
1304e2daccb0SLi Zhijian block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1305e2daccb0SLi Zhijian trace_qemu_rdma_register_odp_mr(block->block_name);
1306911965acSLi Zhijian
1307911965acSLi Zhijian if (block->pmr[chunk]) {
1308911965acSLi Zhijian qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1309911965acSLi Zhijian len, block->pmr[chunk]->lkey,
1310911965acSLi Zhijian block->block_name, rkey);
1311911965acSLi Zhijian
1312911965acSLi Zhijian }
1313e2daccb0SLi Zhijian }
1314e2daccb0SLi Zhijian }
1315329c9b10SDr. David Alan Gilbert if (!block->pmr[chunk]) {
1316329c9b10SDr. David Alan Gilbert return -1;
1317329c9b10SDr. David Alan Gilbert }
1318329c9b10SDr. David Alan Gilbert rdma->total_registrations++;
1319329c9b10SDr. David Alan Gilbert
1320329c9b10SDr. David Alan Gilbert if (lkey) {
1321329c9b10SDr. David Alan Gilbert *lkey = block->pmr[chunk]->lkey;
1322329c9b10SDr. David Alan Gilbert }
1323329c9b10SDr. David Alan Gilbert if (rkey) {
1324329c9b10SDr. David Alan Gilbert *rkey = block->pmr[chunk]->rkey;
1325329c9b10SDr. David Alan Gilbert }
1326329c9b10SDr. David Alan Gilbert return 0;
1327329c9b10SDr. David Alan Gilbert }
1328329c9b10SDr. David Alan Gilbert
1329329c9b10SDr. David Alan Gilbert /*
1330329c9b10SDr. David Alan Gilbert * Register (at connection time) the memory used for control
1331329c9b10SDr. David Alan Gilbert * channel messages.
1332329c9b10SDr. David Alan Gilbert */
1333329c9b10SDr. David Alan Gilbert static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1334329c9b10SDr. David Alan Gilbert {
1335329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1336329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1337329c9b10SDr. David Alan Gilbert IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1338329c9b10SDr. David Alan Gilbert if (rdma->wr_data[idx].control_mr) {
1339329c9b10SDr. David Alan Gilbert rdma->total_registrations++;
1340329c9b10SDr. David Alan Gilbert return 0;
1341329c9b10SDr. David Alan Gilbert }
1342329c9b10SDr. David Alan Gilbert return -1;
1343329c9b10SDr. David Alan Gilbert }
1344329c9b10SDr. David Alan Gilbert
1345329c9b10SDr. David Alan Gilbert /*
1346329c9b10SDr. David Alan Gilbert * Perform a non-optimized memory unregistration after every transfer
134724ec68efSDr. David Alan Gilbert * for demonstration purposes, only if pin-all is not requested.
1348329c9b10SDr. David Alan Gilbert *
1349329c9b10SDr. David Alan Gilbert * Potential optimizations:
1350329c9b10SDr. David Alan Gilbert * 1. Start a new thread to run this function continuously
1351329c9b10SDr. David Alan Gilbert - for bit clearing
1352329c9b10SDr. David Alan Gilbert - and for receipt of unregister messages
1353329c9b10SDr. David Alan Gilbert * 2. Use an LRU.
1354329c9b10SDr. David Alan Gilbert * 3. Use workload hints.
1355329c9b10SDr. David Alan Gilbert */
1356329c9b10SDr. David Alan Gilbert static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1357329c9b10SDr. David Alan Gilbert {
1358c4c78dceSMarkus Armbruster Error *err = NULL;
1359c4c78dceSMarkus Armbruster
1360329c9b10SDr. David Alan Gilbert while (rdma->unregistrations[rdma->unregister_current]) {
1361329c9b10SDr. David Alan Gilbert int ret;
1362329c9b10SDr. David Alan Gilbert uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1363329c9b10SDr. David Alan Gilbert uint64_t chunk =
1364329c9b10SDr. David Alan Gilbert (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1365329c9b10SDr. David Alan Gilbert uint64_t index =
1366329c9b10SDr. David Alan Gilbert (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1367329c9b10SDr. David Alan Gilbert RDMALocalBlock *block =
1368329c9b10SDr. David Alan Gilbert &(rdma->local_ram_blocks.block[index]);
1369329c9b10SDr. David Alan Gilbert RDMARegister reg = { .current_index = index };
1370329c9b10SDr. David Alan Gilbert RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1371329c9b10SDr. David Alan Gilbert };
1372329c9b10SDr. David Alan Gilbert RDMAControlHeader head = { .len = sizeof(RDMARegister),
1373329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1374329c9b10SDr. David Alan Gilbert .repeat = 1,
1375329c9b10SDr. David Alan Gilbert };
1376329c9b10SDr. David Alan Gilbert
1377733252deSDr. David Alan Gilbert trace_qemu_rdma_unregister_waiting_proc(chunk,
1378733252deSDr. David Alan Gilbert rdma->unregister_current);
1379329c9b10SDr. David Alan Gilbert
1380329c9b10SDr. David Alan Gilbert rdma->unregistrations[rdma->unregister_current] = 0;
1381329c9b10SDr. David Alan Gilbert rdma->unregister_current++;
1382329c9b10SDr. David Alan Gilbert
1383329c9b10SDr. David Alan Gilbert if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1384329c9b10SDr. David Alan Gilbert rdma->unregister_current = 0;
1385329c9b10SDr. David Alan Gilbert }
1386329c9b10SDr. David Alan Gilbert
1387329c9b10SDr. David Alan Gilbert
1388329c9b10SDr. David Alan Gilbert /*
1389329c9b10SDr. David Alan Gilbert * Unregistration is speculative (because migration is single-threaded
1390329c9b10SDr. David Alan Gilbert * and we cannot break the protocol's InfiniBand message ordering).
1391329c9b10SDr. David Alan Gilbert * Thus, if the memory is currently being used for transmission,
1392329c9b10SDr. David Alan Gilbert * then abort the attempt to unregister and try again
1393329c9b10SDr. David Alan Gilbert * later the next time a completion is received for this memory.
1394329c9b10SDr. David Alan Gilbert */
1395329c9b10SDr. David Alan Gilbert clear_bit(chunk, block->unregister_bitmap);
1396329c9b10SDr. David Alan Gilbert
1397329c9b10SDr. David Alan Gilbert if (test_bit(chunk, block->transit_bitmap)) {
1398733252deSDr. David Alan Gilbert trace_qemu_rdma_unregister_waiting_inflight(chunk);
1399329c9b10SDr. David Alan Gilbert continue;
1400329c9b10SDr. David Alan Gilbert }
1401329c9b10SDr. David Alan Gilbert
1402733252deSDr. David Alan Gilbert trace_qemu_rdma_unregister_waiting_send(chunk);
1403329c9b10SDr. David Alan Gilbert
1404329c9b10SDr. David Alan Gilbert ret = ibv_dereg_mr(block->pmr[chunk]);
1405329c9b10SDr. David Alan Gilbert block->pmr[chunk] = NULL;
1406329c9b10SDr. David Alan Gilbert block->remote_keys[chunk] = 0;
1407329c9b10SDr. David Alan Gilbert
1408329c9b10SDr. David Alan Gilbert if (ret != 0) {
1409ff4c9194SMarkus Armbruster error_report("chunk unregistration failed: %s",
1410ff4c9194SMarkus Armbruster strerror(ret));
14118c6513f7SMarkus Armbruster return -1;
1412329c9b10SDr. David Alan Gilbert }
1413329c9b10SDr. David Alan Gilbert rdma->total_registrations--;
1414329c9b10SDr. David Alan Gilbert
1415329c9b10SDr. David Alan Gilbert reg.key.chunk = chunk;
1416b12f7777SDr. David Alan Gilbert register_to_network(rdma, &reg);
1417329c9b10SDr. David Alan Gilbert ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1418c4c78dceSMarkus Armbruster &resp, NULL, NULL, &err);
1419329c9b10SDr. David Alan Gilbert if (ret < 0) {
1420c4c78dceSMarkus Armbruster error_report_err(err);
1421ec486974SMarkus Armbruster return -1;
1422329c9b10SDr. David Alan Gilbert }
1423329c9b10SDr. David Alan Gilbert
1424733252deSDr. David Alan Gilbert trace_qemu_rdma_unregister_waiting_complete(chunk);
1425329c9b10SDr. David Alan Gilbert }
1426329c9b10SDr. David Alan Gilbert
1427329c9b10SDr. David Alan Gilbert return 0;
1428329c9b10SDr. David Alan Gilbert }
1429329c9b10SDr. David Alan Gilbert
1430329c9b10SDr. David Alan Gilbert static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1431329c9b10SDr. David Alan Gilbert uint64_t chunk)
1432329c9b10SDr. David Alan Gilbert {
1433329c9b10SDr. David Alan Gilbert uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1434329c9b10SDr. David Alan Gilbert
1435329c9b10SDr. David Alan Gilbert result |= (index << RDMA_WRID_BLOCK_SHIFT);
1436329c9b10SDr. David Alan Gilbert result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1437329c9b10SDr. David Alan Gilbert
1438329c9b10SDr. David Alan Gilbert return result;
1439329c9b10SDr. David Alan Gilbert }
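
/*
 * The encoding is undone in qemu_rdma_poll() and
 * qemu_rdma_unregister_waiting() with the corresponding masks, i.e. for a
 * completed write:
 *
 *   wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, index, chunk);
 *   ...
 *   chunk = (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
 *   index = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
 *
 * so the work request ID alone identifies the RAM block and chunk that a
 * completion refers to, without any extra bookkeeping.
 */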
1440329c9b10SDr. David Alan Gilbert
1441329c9b10SDr. David Alan Gilbert /*
1442329c9b10SDr. David Alan Gilbert * Poll the completion queue to see whether a work request
1443329c9b10SDr. David Alan Gilbert * (of any kind) has completed.
1444329c9b10SDr. David Alan Gilbert * Return the work request ID that completed.
1445329c9b10SDr. David Alan Gilbert */
1446b72eacf3SMarkus Armbruster static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1447b390afd8SLi Zhijian uint64_t *wr_id_out, uint32_t *byte_len)
1448329c9b10SDr. David Alan Gilbert {
1449329c9b10SDr. David Alan Gilbert int ret;
1450329c9b10SDr. David Alan Gilbert struct ibv_wc wc;
1451329c9b10SDr. David Alan Gilbert uint64_t wr_id;
1452329c9b10SDr. David Alan Gilbert
1453b390afd8SLi Zhijian ret = ibv_poll_cq(cq, 1, &wc);
1454329c9b10SDr. David Alan Gilbert
1455329c9b10SDr. David Alan Gilbert if (!ret) {
1456329c9b10SDr. David Alan Gilbert *wr_id_out = RDMA_WRID_NONE;
1457329c9b10SDr. David Alan Gilbert return 0;
1458329c9b10SDr. David Alan Gilbert }
1459329c9b10SDr. David Alan Gilbert
1460329c9b10SDr. David Alan Gilbert if (ret < 0) {
1461ec486974SMarkus Armbruster return -1;
1462329c9b10SDr. David Alan Gilbert }
1463329c9b10SDr. David Alan Gilbert
1464329c9b10SDr. David Alan Gilbert wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1465329c9b10SDr. David Alan Gilbert
1466329c9b10SDr. David Alan Gilbert if (wc.status != IBV_WC_SUCCESS) {
1467329c9b10SDr. David Alan Gilbert return -1;
1468329c9b10SDr. David Alan Gilbert }
1469329c9b10SDr. David Alan Gilbert
1470329c9b10SDr. David Alan Gilbert if (rdma->control_ready_expected &&
1471329c9b10SDr. David Alan Gilbert (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1472b5631d5bSMarkus Armbruster trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
1473b5631d5bSMarkus Armbruster rdma->nb_sent);
1474329c9b10SDr. David Alan Gilbert rdma->control_ready_expected = 0;
1475329c9b10SDr. David Alan Gilbert }
1476329c9b10SDr. David Alan Gilbert
1477329c9b10SDr. David Alan Gilbert if (wr_id == RDMA_WRID_RDMA_WRITE) {
1478329c9b10SDr. David Alan Gilbert uint64_t chunk =
1479329c9b10SDr. David Alan Gilbert (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1480329c9b10SDr. David Alan Gilbert uint64_t index =
1481329c9b10SDr. David Alan Gilbert (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1482329c9b10SDr. David Alan Gilbert RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1483329c9b10SDr. David Alan Gilbert
1484b5631d5bSMarkus Armbruster trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
1485fbce8c25SStefan Weil index, chunk, block->local_host_addr,
1486fbce8c25SStefan Weil (void *)(uintptr_t)block->remote_host_addr);
1487329c9b10SDr. David Alan Gilbert
1488329c9b10SDr. David Alan Gilbert clear_bit(chunk, block->transit_bitmap);
1489329c9b10SDr. David Alan Gilbert
1490329c9b10SDr. David Alan Gilbert if (rdma->nb_sent > 0) {
1491329c9b10SDr. David Alan Gilbert rdma->nb_sent--;
1492329c9b10SDr. David Alan Gilbert }
1493329c9b10SDr. David Alan Gilbert } else {
1494b5631d5bSMarkus Armbruster trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
1495329c9b10SDr. David Alan Gilbert }
1496329c9b10SDr. David Alan Gilbert
1497329c9b10SDr. David Alan Gilbert *wr_id_out = wc.wr_id;
1498329c9b10SDr. David Alan Gilbert if (byte_len) {
1499329c9b10SDr. David Alan Gilbert *byte_len = wc.byte_len;
1500329c9b10SDr. David Alan Gilbert }
1501329c9b10SDr. David Alan Gilbert
1502329c9b10SDr. David Alan Gilbert return 0;
1503329c9b10SDr. David Alan Gilbert }
1504329c9b10SDr. David Alan Gilbert
15059c98cfbeSDr. David Alan Gilbert /* Wait for activity on the completion channel.
15069c98cfbeSDr. David Alan Gilbert * Returns 0 on success, non-zero on error.
15079c98cfbeSDr. David Alan Gilbert */
1508b390afd8SLi Zhijian static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1509b390afd8SLi Zhijian struct ibv_comp_channel *comp_channel)
15109c98cfbeSDr. David Alan Gilbert {
1511d5882995SLidong Chen struct rdma_cm_event *cm_event;
1512d5882995SLidong Chen
15139c98cfbeSDr. David Alan Gilbert /*
15149c98cfbeSDr. David Alan Gilbert * Coroutine doesn't start until migration_fd_process_incoming()
15159c98cfbeSDr. David Alan Gilbert * so don't yield unless we know we're running inside of a coroutine.
15169c98cfbeSDr. David Alan Gilbert */
1517f5627c2aSLidong Chen if (rdma->migration_started_on_destination &&
1518f5627c2aSLidong Chen migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1519b390afd8SLi Zhijian yield_until_fd_readable(comp_channel->fd);
15209c98cfbeSDr. David Alan Gilbert } else {
15219c98cfbeSDr. David Alan Gilbert /* This is either the source side (which runs in a separate thread),
15229c98cfbeSDr. David Alan Gilbert * or the destination prior to migration_fd_process_incoming();
15233a4452d8Szhaolichang * after postcopy the destination is also in a separate thread.
15249c98cfbeSDr. David Alan Gilbert * We can't yield, so we have to poll the fd.
15259c98cfbeSDr. David Alan Gilbert * But we need to be able to handle 'cancel' or an error
15269c98cfbeSDr. David Alan Gilbert * without hanging forever.
15279c98cfbeSDr. David Alan Gilbert */
1528b86c94a4SMarkus Armbruster while (!rdma->errored && !rdma->received_error) {
1529d5882995SLidong Chen GPollFD pfds[2];
1530b390afd8SLi Zhijian pfds[0].fd = comp_channel->fd;
15319c98cfbeSDr. David Alan Gilbert pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1532d5882995SLidong Chen pfds[0].revents = 0;
1533d5882995SLidong Chen
1534d5882995SLidong Chen pfds[1].fd = rdma->channel->fd;
1535d5882995SLidong Chen pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1536d5882995SLidong Chen pfds[1].revents = 0;
1537d5882995SLidong Chen
15389c98cfbeSDr. David Alan Gilbert /* 0.1s timeout, should be fine for a 'cancel' */
1539d5882995SLidong Chen switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1540d5882995SLidong Chen case 2:
15419c98cfbeSDr. David Alan Gilbert case 1: /* fd active */
1542d5882995SLidong Chen if (pfds[0].revents) {
15439c98cfbeSDr. David Alan Gilbert return 0;
1544d5882995SLidong Chen }
1545d5882995SLidong Chen
1546d5882995SLidong Chen if (pfds[1].revents) {
15478f5a7faaSJuan Quintela if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
15488c6513f7SMarkus Armbruster return -1;
1549d5882995SLidong Chen }
1550d5882995SLidong Chen
1551d5882995SLidong Chen if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1552d5882995SLidong Chen cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
15536b8c2eb5SLi Zhijian rdma_ack_cm_event(cm_event);
15548c6513f7SMarkus Armbruster return -1;
1555d5882995SLidong Chen }
15566b8c2eb5SLi Zhijian rdma_ack_cm_event(cm_event);
1557d5882995SLidong Chen }
1558d5882995SLidong Chen break;
15599c98cfbeSDr. David Alan Gilbert
15609c98cfbeSDr. David Alan Gilbert case 0: /* Timeout, go around again */
15619c98cfbeSDr. David Alan Gilbert break;
15629c98cfbeSDr. David Alan Gilbert
15639c98cfbeSDr. David Alan Gilbert default: /* Error of some type -
15649c98cfbeSDr. David Alan Gilbert * I don't trust errno from qemu_poll_ns
15659c98cfbeSDr. David Alan Gilbert */
15668c6513f7SMarkus Armbruster return -1;
15679c98cfbeSDr. David Alan Gilbert }
15689c98cfbeSDr. David Alan Gilbert
15699c98cfbeSDr. David Alan Gilbert if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
15709c98cfbeSDr. David Alan Gilbert /* Bail out and let the cancellation happen */
15718c6513f7SMarkus Armbruster return -1;
15729c98cfbeSDr. David Alan Gilbert }
15739c98cfbeSDr. David Alan Gilbert }
15749c98cfbeSDr. David Alan Gilbert }
15759c98cfbeSDr. David Alan Gilbert
15769c98cfbeSDr. David Alan Gilbert if (rdma->received_error) {
15778c6513f7SMarkus Armbruster return -1;
15789c98cfbeSDr. David Alan Gilbert }
1579b86c94a4SMarkus Armbruster return -rdma->errored;
15809c98cfbeSDr. David Alan Gilbert }
15819c98cfbeSDr. David Alan Gilbert
158287a24ca3SMarkus Armbruster static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
1583b390afd8SLi Zhijian {
1584b390afd8SLi Zhijian return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1585b390afd8SLi Zhijian rdma->recv_comp_channel;
1586b390afd8SLi Zhijian }
1587b390afd8SLi Zhijian
158887a24ca3SMarkus Armbruster static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
1589b390afd8SLi Zhijian {
1590b390afd8SLi Zhijian return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1591b390afd8SLi Zhijian }
1592b390afd8SLi Zhijian
1593329c9b10SDr. David Alan Gilbert /*
1594329c9b10SDr. David Alan Gilbert * Block until the next work request has completed.
1595329c9b10SDr. David Alan Gilbert *
1596329c9b10SDr. David Alan Gilbert * First poll to see if a work request has already completed,
1597329c9b10SDr. David Alan Gilbert * otherwise block.
1598329c9b10SDr. David Alan Gilbert *
1599329c9b10SDr. David Alan Gilbert * If we encounter completed work requests for IDs other than
1600329c9b10SDr. David Alan Gilbert * the one we're interested in, then that's generally an error.
1601329c9b10SDr. David Alan Gilbert *
1602329c9b10SDr. David Alan Gilbert * The only exception is actual RDMA Write completions. These
1603329c9b10SDr. David Alan Gilbert * completions only need to be recorded, but do not actually
1604329c9b10SDr. David Alan Gilbert * need further processing.
1605329c9b10SDr. David Alan Gilbert */
160687a24ca3SMarkus Armbruster static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
160787a24ca3SMarkus Armbruster uint64_t wrid_requested,
1608329c9b10SDr. David Alan Gilbert uint32_t *byte_len)
1609329c9b10SDr. David Alan Gilbert {
1610c0d77702SMarkus Armbruster int num_cq_events = 0, ret;
1611329c9b10SDr. David Alan Gilbert struct ibv_cq *cq;
1612329c9b10SDr. David Alan Gilbert void *cq_ctx;
1613329c9b10SDr. David Alan Gilbert uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1614b390afd8SLi Zhijian struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1615b390afd8SLi Zhijian struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1616329c9b10SDr. David Alan Gilbert
1617b390afd8SLi Zhijian if (ibv_req_notify_cq(poll_cq, 0)) {
1618329c9b10SDr. David Alan Gilbert return -1;
1619329c9b10SDr. David Alan Gilbert }
1620329c9b10SDr. David Alan Gilbert /* poll cq first */
1621329c9b10SDr. David Alan Gilbert while (wr_id != wrid_requested) {
1622b390afd8SLi Zhijian ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1623329c9b10SDr. David Alan Gilbert if (ret < 0) {
1624ec486974SMarkus Armbruster return -1;
1625329c9b10SDr. David Alan Gilbert }
1626329c9b10SDr. David Alan Gilbert
1627329c9b10SDr. David Alan Gilbert wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1628329c9b10SDr. David Alan Gilbert
1629329c9b10SDr. David Alan Gilbert if (wr_id == RDMA_WRID_NONE) {
1630329c9b10SDr. David Alan Gilbert break;
1631329c9b10SDr. David Alan Gilbert }
1632329c9b10SDr. David Alan Gilbert if (wr_id != wrid_requested) {
1633b5631d5bSMarkus Armbruster trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1634329c9b10SDr. David Alan Gilbert }
1635329c9b10SDr. David Alan Gilbert }
1636329c9b10SDr. David Alan Gilbert
1637329c9b10SDr. David Alan Gilbert if (wr_id == wrid_requested) {
1638329c9b10SDr. David Alan Gilbert return 0;
1639329c9b10SDr. David Alan Gilbert }
1640329c9b10SDr. David Alan Gilbert
1641329c9b10SDr. David Alan Gilbert while (1) {
1642b390afd8SLi Zhijian ret = qemu_rdma_wait_comp_channel(rdma, ch);
16434a102179SMarkus Armbruster if (ret < 0) {
16449c98cfbeSDr. David Alan Gilbert goto err_block_for_wrid;
1645329c9b10SDr. David Alan Gilbert }
1646329c9b10SDr. David Alan Gilbert
1647b390afd8SLi Zhijian ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
16484a102179SMarkus Armbruster if (ret < 0) {
1649329c9b10SDr. David Alan Gilbert goto err_block_for_wrid;
1650329c9b10SDr. David Alan Gilbert }
1651329c9b10SDr. David Alan Gilbert
1652329c9b10SDr. David Alan Gilbert num_cq_events++;
1653329c9b10SDr. David Alan Gilbert
1654c0d77702SMarkus Armbruster if (ibv_req_notify_cq(cq, 0)) {
1655329c9b10SDr. David Alan Gilbert goto err_block_for_wrid;
1656329c9b10SDr. David Alan Gilbert }
1657329c9b10SDr. David Alan Gilbert
1658329c9b10SDr. David Alan Gilbert while (wr_id != wrid_requested) {
1659b390afd8SLi Zhijian ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1660329c9b10SDr. David Alan Gilbert if (ret < 0) {
1661329c9b10SDr. David Alan Gilbert goto err_block_for_wrid;
1662329c9b10SDr. David Alan Gilbert }
1663329c9b10SDr. David Alan Gilbert
1664329c9b10SDr. David Alan Gilbert wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1665329c9b10SDr. David Alan Gilbert
1666329c9b10SDr. David Alan Gilbert if (wr_id == RDMA_WRID_NONE) {
1667329c9b10SDr. David Alan Gilbert break;
1668329c9b10SDr. David Alan Gilbert }
1669329c9b10SDr. David Alan Gilbert if (wr_id != wrid_requested) {
1670b5631d5bSMarkus Armbruster trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1671329c9b10SDr. David Alan Gilbert }
1672329c9b10SDr. David Alan Gilbert }
1673329c9b10SDr. David Alan Gilbert
1674329c9b10SDr. David Alan Gilbert if (wr_id == wrid_requested) {
1675329c9b10SDr. David Alan Gilbert goto success_block_for_wrid;
1676329c9b10SDr. David Alan Gilbert }
1677329c9b10SDr. David Alan Gilbert }
1678329c9b10SDr. David Alan Gilbert
1679329c9b10SDr. David Alan Gilbert success_block_for_wrid:
1680329c9b10SDr. David Alan Gilbert if (num_cq_events) {
1681329c9b10SDr. David Alan Gilbert ibv_ack_cq_events(cq, num_cq_events);
1682329c9b10SDr. David Alan Gilbert }
1683329c9b10SDr. David Alan Gilbert return 0;
1684329c9b10SDr. David Alan Gilbert
1685329c9b10SDr. David Alan Gilbert err_block_for_wrid:
1686329c9b10SDr. David Alan Gilbert if (num_cq_events) {
1687329c9b10SDr. David Alan Gilbert ibv_ack_cq_events(cq, num_cq_events);
1688329c9b10SDr. David Alan Gilbert }
16890b3c15f0SDr. David Alan Gilbert
1690b86c94a4SMarkus Armbruster rdma->errored = true;
1691ec486974SMarkus Armbruster return -1;
1692329c9b10SDr. David Alan Gilbert }
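
/*
 * The notify/poll/ack dance above is the standard libibverbs completion
 * channel pattern.  Stripped of the migration-specific bookkeeping (and
 * with error handling trimmed), the core loop is, illustratively:
 *
 *   ibv_req_notify_cq(cq, 0);
 *   while (ibv_poll_cq(cq, 1, &wc) == 0) {
 *       ibv_get_cq_event(ch, &ev_cq, &ev_ctx);
 *       ibv_ack_cq_events(ev_cq, 1);
 *       ibv_req_notify_cq(ev_cq, 0);
 *   }
 *
 * i.e. arm the CQ, drain any completion that is already queued, and only
 * then block on the channel; after each wakeup the CQ is acked, re-armed
 * and polled again.  qemu_rdma_block_for_wrid() batches the acks
 * (num_cq_events) and layers qemu_rdma_wait_comp_channel() on top so that
 * cancellation and coroutine scheduling keep working.
 */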
1693329c9b10SDr. David Alan Gilbert
1694329c9b10SDr. David Alan Gilbert /*
1695329c9b10SDr. David Alan Gilbert * Post a SEND message work request for the control channel
1696329c9b10SDr. David Alan Gilbert * containing some data and block until the post completes.
1697329c9b10SDr. David Alan Gilbert */
1698329c9b10SDr. David Alan Gilbert static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1699f3805964SMarkus Armbruster RDMAControlHeader *head,
1700f3805964SMarkus Armbruster Error **errp)
1701329c9b10SDr. David Alan Gilbert {
1702c0d77702SMarkus Armbruster int ret;
1703329c9b10SDr. David Alan Gilbert RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1704329c9b10SDr. David Alan Gilbert struct ibv_send_wr *bad_wr;
1705329c9b10SDr. David Alan Gilbert struct ibv_sge sge = {
1706fbce8c25SStefan Weil .addr = (uintptr_t)(wr->control),
1707329c9b10SDr. David Alan Gilbert .length = head->len + sizeof(RDMAControlHeader),
1708329c9b10SDr. David Alan Gilbert .lkey = wr->control_mr->lkey,
1709329c9b10SDr. David Alan Gilbert };
1710329c9b10SDr. David Alan Gilbert struct ibv_send_wr send_wr = {
1711329c9b10SDr. David Alan Gilbert .wr_id = RDMA_WRID_SEND_CONTROL,
1712329c9b10SDr. David Alan Gilbert .opcode = IBV_WR_SEND,
1713329c9b10SDr. David Alan Gilbert .send_flags = IBV_SEND_SIGNALED,
1714329c9b10SDr. David Alan Gilbert .sg_list = &sge,
1715329c9b10SDr. David Alan Gilbert .num_sge = 1,
1716329c9b10SDr. David Alan Gilbert };
1717329c9b10SDr. David Alan Gilbert
1718482a33c5SDr. David Alan Gilbert trace_qemu_rdma_post_send_control(control_desc(head->type));
1719329c9b10SDr. David Alan Gilbert
1720329c9b10SDr. David Alan Gilbert /*
1721329c9b10SDr. David Alan Gilbert * We don't actually need to do a memcpy() in here if we used
1722329c9b10SDr. David Alan Gilbert * the "sge" properly, but since we're only sending control messages
1723329c9b10SDr. David Alan Gilbert * (not RAM in a performance-critical path), then it's OK for now.
1724329c9b10SDr. David Alan Gilbert *
1725329c9b10SDr. David Alan Gilbert * The copy makes the RDMAControlHeader simpler to manipulate
1726329c9b10SDr. David Alan Gilbert * for the time being.
1727329c9b10SDr. David Alan Gilbert */
1728329c9b10SDr. David Alan Gilbert assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1729329c9b10SDr. David Alan Gilbert memcpy(wr->control, head, sizeof(RDMAControlHeader));
1730329c9b10SDr. David Alan Gilbert control_to_network((void *) wr->control);
1731329c9b10SDr. David Alan Gilbert
1732329c9b10SDr. David Alan Gilbert if (buf) {
1733329c9b10SDr. David Alan Gilbert memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1734329c9b10SDr. David Alan Gilbert }
1735329c9b10SDr. David Alan Gilbert
1736329c9b10SDr. David Alan Gilbert
1737329c9b10SDr. David Alan Gilbert ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1738329c9b10SDr. David Alan Gilbert
1739329c9b10SDr. David Alan Gilbert if (ret > 0) {
1740f3805964SMarkus Armbruster error_setg(errp, "Failed to use post IB SEND for control");
17418c6513f7SMarkus Armbruster return -1;
1742329c9b10SDr. David Alan Gilbert }
1743329c9b10SDr. David Alan Gilbert
1744329c9b10SDr. David Alan Gilbert ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1745329c9b10SDr. David Alan Gilbert if (ret < 0) {
1746f3805964SMarkus Armbruster error_setg(errp, "rdma migration: send polling control error");
1747ec486974SMarkus Armbruster return -1;
1748329c9b10SDr. David Alan Gilbert }
1749329c9b10SDr. David Alan Gilbert
1750ec486974SMarkus Armbruster return 0;
1751329c9b10SDr. David Alan Gilbert }
1752329c9b10SDr. David Alan Gilbert
1753329c9b10SDr. David Alan Gilbert /*
1754329c9b10SDr. David Alan Gilbert * Post a RECV work request in anticipation of some future receipt
1755329c9b10SDr. David Alan Gilbert * of data on the control channel.
1756329c9b10SDr. David Alan Gilbert */
17573c0c3ebaSMarkus Armbruster static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx,
17583c0c3ebaSMarkus Armbruster Error **errp)
1759329c9b10SDr. David Alan Gilbert {
1760329c9b10SDr. David Alan Gilbert struct ibv_recv_wr *bad_wr;
1761329c9b10SDr. David Alan Gilbert struct ibv_sge sge = {
1762fbce8c25SStefan Weil .addr = (uintptr_t)(rdma->wr_data[idx].control),
1763329c9b10SDr. David Alan Gilbert .length = RDMA_CONTROL_MAX_BUFFER,
1764329c9b10SDr. David Alan Gilbert .lkey = rdma->wr_data[idx].control_mr->lkey,
1765329c9b10SDr. David Alan Gilbert };
1766329c9b10SDr. David Alan Gilbert
1767329c9b10SDr. David Alan Gilbert struct ibv_recv_wr recv_wr = {
1768329c9b10SDr. David Alan Gilbert .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1769329c9b10SDr. David Alan Gilbert .sg_list = &sge,
1770329c9b10SDr. David Alan Gilbert .num_sge = 1,
1771329c9b10SDr. David Alan Gilbert };
1772329c9b10SDr. David Alan Gilbert
1773329c9b10SDr. David Alan Gilbert
1774329c9b10SDr. David Alan Gilbert if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
17753c0c3ebaSMarkus Armbruster error_setg(errp, "error posting control recv");
1776329c9b10SDr. David Alan Gilbert return -1;
1777329c9b10SDr. David Alan Gilbert }
1778329c9b10SDr. David Alan Gilbert
1779329c9b10SDr. David Alan Gilbert return 0;
1780329c9b10SDr. David Alan Gilbert }
1781329c9b10SDr. David Alan Gilbert
1782329c9b10SDr. David Alan Gilbert /*
1783329c9b10SDr. David Alan Gilbert * Block and wait for a RECV control channel message to arrive.
1784329c9b10SDr. David Alan Gilbert */
1785329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
17863765ec1fSMarkus Armbruster RDMAControlHeader *head, uint32_t expecting, int idx,
17873765ec1fSMarkus Armbruster Error **errp)
1788329c9b10SDr. David Alan Gilbert {
1789329c9b10SDr. David Alan Gilbert uint32_t byte_len;
1790329c9b10SDr. David Alan Gilbert int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1791329c9b10SDr. David Alan Gilbert &byte_len);
1792329c9b10SDr. David Alan Gilbert
1793329c9b10SDr. David Alan Gilbert if (ret < 0) {
17943765ec1fSMarkus Armbruster error_setg(errp, "rdma migration: recv polling control error!");
1795ec486974SMarkus Armbruster return -1;
1796329c9b10SDr. David Alan Gilbert }
1797329c9b10SDr. David Alan Gilbert
1798329c9b10SDr. David Alan Gilbert network_to_control((void *) rdma->wr_data[idx].control);
1799329c9b10SDr. David Alan Gilbert memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1800329c9b10SDr. David Alan Gilbert
1801482a33c5SDr. David Alan Gilbert trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1802329c9b10SDr. David Alan Gilbert
1803329c9b10SDr. David Alan Gilbert if (expecting == RDMA_CONTROL_NONE) {
1804482a33c5SDr. David Alan Gilbert trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1805733252deSDr. David Alan Gilbert head->type);
1806329c9b10SDr. David Alan Gilbert } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
18073765ec1fSMarkus Armbruster error_setg(errp, "Was expecting a %s (%d) control message"
1808733252deSDr. David Alan Gilbert ", but got: %s (%d), length: %d",
1809482a33c5SDr. David Alan Gilbert control_desc(expecting), expecting,
1810482a33c5SDr. David Alan Gilbert control_desc(head->type), head->type, head->len);
1811cd5ea070SDr. David Alan Gilbert if (head->type == RDMA_CONTROL_ERROR) {
1812cd5ea070SDr. David Alan Gilbert rdma->received_error = true;
1813cd5ea070SDr. David Alan Gilbert }
18148c6513f7SMarkus Armbruster return -1;
1815329c9b10SDr. David Alan Gilbert }
1816329c9b10SDr. David Alan Gilbert if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
18173765ec1fSMarkus Armbruster error_setg(errp, "too long length: %d", head->len);
18188c6513f7SMarkus Armbruster return -1;
1819329c9b10SDr. David Alan Gilbert }
1820329c9b10SDr. David Alan Gilbert if (sizeof(*head) + head->len != byte_len) {
18213765ec1fSMarkus Armbruster error_setg(errp, "Malformed length: %d byte_len %d",
18223765ec1fSMarkus Armbruster head->len, byte_len);
18238c6513f7SMarkus Armbruster return -1;
1824329c9b10SDr. David Alan Gilbert }
1825329c9b10SDr. David Alan Gilbert
1826329c9b10SDr. David Alan Gilbert return 0;
1827329c9b10SDr. David Alan Gilbert }
1828329c9b10SDr. David Alan Gilbert
1829329c9b10SDr. David Alan Gilbert /*
1830329c9b10SDr. David Alan Gilbert * When a RECV work request has completed, the work request's
1831329c9b10SDr. David Alan Gilbert * buffer is pointed at the header.
1832329c9b10SDr. David Alan Gilbert *
1833329c9b10SDr. David Alan Gilbert * This advances the pointer past the header, to the data portion
1834329c9b10SDr. David Alan Gilbert * of the control message that was populated into the work request's
1835329c9b10SDr. David Alan Gilbert * buffer once the work request finished.
1836329c9b10SDr. David Alan Gilbert */
1837329c9b10SDr. David Alan Gilbert static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1838329c9b10SDr. David Alan Gilbert RDMAControlHeader *head)
1839329c9b10SDr. David Alan Gilbert {
1840329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control_len = head->len;
1841329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control_curr =
1842329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1843329c9b10SDr. David Alan Gilbert }
1844329c9b10SDr. David Alan Gilbert
1845329c9b10SDr. David Alan Gilbert /*
1846329c9b10SDr. David Alan Gilbert * This is an 'atomic' high-level operation to deliver a single, unified
1847329c9b10SDr. David Alan Gilbert * control-channel message.
1848329c9b10SDr. David Alan Gilbert *
1849329c9b10SDr. David Alan Gilbert * Additionally, if the user is expecting some kind of reply to this message,
1850329c9b10SDr. David Alan Gilbert * they can request a 'resp' response message be filled in by posting an
1851329c9b10SDr. David Alan Gilbert * additional work request on behalf of the user and waiting for an additional
1852329c9b10SDr. David Alan Gilbert * completion.
1853329c9b10SDr. David Alan Gilbert *
1854329c9b10SDr. David Alan Gilbert * The extra (optional) response is used during registration to save us from
1855329c9b10SDr. David Alan Gilbert * having to perform an *additional* exchange of messages just to provide a
1856329c9b10SDr. David Alan Gilbert * response, by instead piggy-backing on the acknowledgement.
1857329c9b10SDr. David Alan Gilbert */
1858329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1859329c9b10SDr. David Alan Gilbert uint8_t *data, RDMAControlHeader *resp,
1860329c9b10SDr. David Alan Gilbert int *resp_idx,
1861de1aa35fSMarkus Armbruster int (*callback)(RDMAContext *rdma,
1862de1aa35fSMarkus Armbruster Error **errp),
1863c4c78dceSMarkus Armbruster Error **errp)
1864329c9b10SDr. David Alan Gilbert {
1865c0d77702SMarkus Armbruster int ret;
1866329c9b10SDr. David Alan Gilbert
1867329c9b10SDr. David Alan Gilbert /*
1868329c9b10SDr. David Alan Gilbert * Wait until the dest is ready before attempting to deliver the message
1869329c9b10SDr. David Alan Gilbert * by waiting for a READY message.
1870329c9b10SDr. David Alan Gilbert */
1871329c9b10SDr. David Alan Gilbert if (rdma->control_ready_expected) {
18727f3de3f0SMarkus Armbruster RDMAControlHeader resp_ignored;
18737f3de3f0SMarkus Armbruster
18747f3de3f0SMarkus Armbruster ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
18757f3de3f0SMarkus Armbruster RDMA_CONTROL_READY,
18763765ec1fSMarkus Armbruster RDMA_WRID_READY, errp);
1877329c9b10SDr. David Alan Gilbert if (ret < 0) {
1878ec486974SMarkus Armbruster return -1;
1879329c9b10SDr. David Alan Gilbert }
1880329c9b10SDr. David Alan Gilbert }
1881329c9b10SDr. David Alan Gilbert
1882329c9b10SDr. David Alan Gilbert /*
1883329c9b10SDr. David Alan Gilbert * If the user is expecting a response, post a WR in anticipation of it.
1884329c9b10SDr. David Alan Gilbert */
1885329c9b10SDr. David Alan Gilbert if (resp) {
18863c0c3ebaSMarkus Armbruster ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp);
18874a102179SMarkus Armbruster if (ret < 0) {
1888ec486974SMarkus Armbruster return -1;
1889329c9b10SDr. David Alan Gilbert }
1890329c9b10SDr. David Alan Gilbert }
1891329c9b10SDr. David Alan Gilbert
1892329c9b10SDr. David Alan Gilbert /*
1893329c9b10SDr. David Alan Gilbert * Post a WR to replace the one we just consumed for the READY message.
1894329c9b10SDr. David Alan Gilbert */
18953c0c3ebaSMarkus Armbruster ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
18964a102179SMarkus Armbruster if (ret < 0) {
1897ec486974SMarkus Armbruster return -1;
1898329c9b10SDr. David Alan Gilbert }
1899329c9b10SDr. David Alan Gilbert
1900329c9b10SDr. David Alan Gilbert /*
1901329c9b10SDr. David Alan Gilbert * Deliver the control message that was requested.
1902329c9b10SDr. David Alan Gilbert */
1903f3805964SMarkus Armbruster ret = qemu_rdma_post_send_control(rdma, data, head, errp);
1904329c9b10SDr. David Alan Gilbert
1905329c9b10SDr. David Alan Gilbert if (ret < 0) {
1906ec486974SMarkus Armbruster return -1;
1907329c9b10SDr. David Alan Gilbert }
1908329c9b10SDr. David Alan Gilbert
1909329c9b10SDr. David Alan Gilbert /*
1910329c9b10SDr. David Alan Gilbert * If we're expecting a response, block and wait for it.
1911329c9b10SDr. David Alan Gilbert */
1912329c9b10SDr. David Alan Gilbert if (resp) {
1913329c9b10SDr. David Alan Gilbert if (callback) {
1914733252deSDr. David Alan Gilbert trace_qemu_rdma_exchange_send_issue_callback();
1915de1aa35fSMarkus Armbruster ret = callback(rdma, errp);
1916329c9b10SDr. David Alan Gilbert if (ret < 0) {
1917ec486974SMarkus Armbruster return -1;
1918329c9b10SDr. David Alan Gilbert }
1919329c9b10SDr. David Alan Gilbert }
1920329c9b10SDr. David Alan Gilbert
1921482a33c5SDr. David Alan Gilbert trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1922329c9b10SDr. David Alan Gilbert ret = qemu_rdma_exchange_get_response(rdma, resp,
19233765ec1fSMarkus Armbruster resp->type, RDMA_WRID_DATA,
19243765ec1fSMarkus Armbruster errp);
1925329c9b10SDr. David Alan Gilbert
1926329c9b10SDr. David Alan Gilbert if (ret < 0) {
1927ec486974SMarkus Armbruster return -1;
1928329c9b10SDr. David Alan Gilbert }
1929329c9b10SDr. David Alan Gilbert
1930329c9b10SDr. David Alan Gilbert qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1931329c9b10SDr. David Alan Gilbert if (resp_idx) {
1932329c9b10SDr. David Alan Gilbert *resp_idx = RDMA_WRID_DATA;
1933329c9b10SDr. David Alan Gilbert }
1934482a33c5SDr. David Alan Gilbert trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1935329c9b10SDr. David Alan Gilbert }
1936329c9b10SDr. David Alan Gilbert
1937329c9b10SDr. David Alan Gilbert rdma->control_ready_expected = 1;
1938329c9b10SDr. David Alan Gilbert
1939329c9b10SDr. David Alan Gilbert return 0;
1940329c9b10SDr. David Alan Gilbert }
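
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): how a caller typically uses
 * qemu_rdma_exchange_send().  The payload below is hypothetical; real
 * callers are the QEMUFile writev path (RDMA_CONTROL_QEMU_FILE) and the
 * chunk-registration path further down, which additionally asks for a
 * piggy-backed RDMA_CONTROL_REGISTER_RESULT via the 'resp' argument.
 */
static int example_send_small_blob(RDMAContext *rdma, Error **errp)
{
    uint8_t blob[64] = { 0 };                    /* hypothetical payload */
    RDMAControlHeader head = {
        .len = sizeof(blob),
        .type = RDMA_CONTROL_QEMU_FILE,          /* plain one-way message */
        .repeat = 1,
    };

    /*
     * resp, resp_idx and callback are all NULL, so no reply is expected;
     * the call still honours the READY handshake before transmitting.
     */
    return qemu_rdma_exchange_send(rdma, &head, blob, NULL, NULL, NULL, errp);
}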
1941329c9b10SDr. David Alan Gilbert
1942329c9b10SDr. David Alan Gilbert /*
1943329c9b10SDr. David Alan Gilbert * This is an 'atomic' high-level operation to receive a single, unified
1944329c9b10SDr. David Alan Gilbert * control-channel message.
1945329c9b10SDr. David Alan Gilbert */
1946329c9b10SDr. David Alan Gilbert static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
194796f363d8SMarkus Armbruster uint32_t expecting, Error **errp)
1948329c9b10SDr. David Alan Gilbert {
1949329c9b10SDr. David Alan Gilbert RDMAControlHeader ready = {
1950329c9b10SDr. David Alan Gilbert .len = 0,
1951329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_READY,
1952329c9b10SDr. David Alan Gilbert .repeat = 1,
1953329c9b10SDr. David Alan Gilbert };
1954329c9b10SDr. David Alan Gilbert int ret;
1955329c9b10SDr. David Alan Gilbert
1956329c9b10SDr. David Alan Gilbert /*
1957329c9b10SDr. David Alan Gilbert * Inform the source that we're ready to receive a message.
1958329c9b10SDr. David Alan Gilbert */
1959f3805964SMarkus Armbruster ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp);
1960329c9b10SDr. David Alan Gilbert
1961329c9b10SDr. David Alan Gilbert if (ret < 0) {
1962ec486974SMarkus Armbruster return -1;
1963329c9b10SDr. David Alan Gilbert }
1964329c9b10SDr. David Alan Gilbert
1965329c9b10SDr. David Alan Gilbert /*
1966329c9b10SDr. David Alan Gilbert * Block and wait for the message.
1967329c9b10SDr. David Alan Gilbert */
1968329c9b10SDr. David Alan Gilbert ret = qemu_rdma_exchange_get_response(rdma, head,
19693765ec1fSMarkus Armbruster expecting, RDMA_WRID_READY, errp);
1970329c9b10SDr. David Alan Gilbert
1971329c9b10SDr. David Alan Gilbert if (ret < 0) {
1972ec486974SMarkus Armbruster return -1;
1973329c9b10SDr. David Alan Gilbert }
1974329c9b10SDr. David Alan Gilbert
1975329c9b10SDr. David Alan Gilbert qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1976329c9b10SDr. David Alan Gilbert
1977329c9b10SDr. David Alan Gilbert /*
1978329c9b10SDr. David Alan Gilbert * Post a new RECV work request to replace the one we just consumed.
1979329c9b10SDr. David Alan Gilbert */
19803c0c3ebaSMarkus Armbruster ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
19814a102179SMarkus Armbruster if (ret < 0) {
1982ec486974SMarkus Armbruster return -1;
1983329c9b10SDr. David Alan Gilbert }
1984329c9b10SDr. David Alan Gilbert
1985329c9b10SDr. David Alan Gilbert return 0;
1986329c9b10SDr. David Alan Gilbert }
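
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): the matching receive side.  Once
 * qemu_rdma_exchange_recv() returns, qemu_rdma_move_header() has already
 * advanced the RDMA_WRID_READY slot past the header, so the payload can be
 * read straight from control_curr/control_len.  The expected message type
 * here is hypothetical.
 */
static int example_recv_small_blob(RDMAContext *rdma, Error **errp)
{
    RDMAControlHeader head;
    uint8_t *payload;
    size_t payload_len;

    if (qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
                                errp) < 0) {
        return -1;
    }

    payload = rdma->wr_data[RDMA_WRID_READY].control_curr;
    payload_len = rdma->wr_data[RDMA_WRID_READY].control_len; /* == head.len */

    (void)payload;
    (void)payload_len;
    return 0;
}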
1987329c9b10SDr. David Alan Gilbert
1988329c9b10SDr. David Alan Gilbert /*
1989329c9b10SDr. David Alan Gilbert * Write an actual chunk of memory using RDMA.
1990329c9b10SDr. David Alan Gilbert *
1991329c9b10SDr. David Alan Gilbert * If we're using dynamic registration on the dest-side, we have to
1992329c9b10SDr. David Alan Gilbert * send a registration command first.
1993329c9b10SDr. David Alan Gilbert */
1994e3378035SJuan Quintela static int qemu_rdma_write_one(RDMAContext *rdma,
1995329c9b10SDr. David Alan Gilbert int current_index, uint64_t current_addr,
1996557c34caSMarkus Armbruster uint64_t length, Error **errp)
1997329c9b10SDr. David Alan Gilbert {
1998329c9b10SDr. David Alan Gilbert struct ibv_sge sge;
1999329c9b10SDr. David Alan Gilbert struct ibv_send_wr send_wr = { 0 };
2000329c9b10SDr. David Alan Gilbert struct ibv_send_wr *bad_wr;
2001329c9b10SDr. David Alan Gilbert int reg_result_idx, ret, count = 0;
2002329c9b10SDr. David Alan Gilbert uint64_t chunk, chunks;
2003329c9b10SDr. David Alan Gilbert uint8_t *chunk_start, *chunk_end;
2004329c9b10SDr. David Alan Gilbert RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2005329c9b10SDr. David Alan Gilbert RDMARegister reg;
2006329c9b10SDr. David Alan Gilbert RDMARegisterResult *reg_result;
2007329c9b10SDr. David Alan Gilbert RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2008329c9b10SDr. David Alan Gilbert RDMAControlHeader head = { .len = sizeof(RDMARegister),
2009329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_REGISTER_REQUEST,
2010329c9b10SDr. David Alan Gilbert .repeat = 1,
2011329c9b10SDr. David Alan Gilbert };
2012329c9b10SDr. David Alan Gilbert
2013329c9b10SDr. David Alan Gilbert retry:
2014fbce8c25SStefan Weil sge.addr = (uintptr_t)(block->local_host_addr +
2015329c9b10SDr. David Alan Gilbert (current_addr - block->offset));
2016329c9b10SDr. David Alan Gilbert sge.length = length;
2017329c9b10SDr. David Alan Gilbert
2018fbce8c25SStefan Weil chunk = ram_chunk_index(block->local_host_addr,
2019fbce8c25SStefan Weil (uint8_t *)(uintptr_t)sge.addr);
2020329c9b10SDr. David Alan Gilbert chunk_start = ram_chunk_start(block, chunk);
2021329c9b10SDr. David Alan Gilbert
2022329c9b10SDr. David Alan Gilbert if (block->is_ram_block) {
2023329c9b10SDr. David Alan Gilbert chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2024329c9b10SDr. David Alan Gilbert
2025329c9b10SDr. David Alan Gilbert if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2026329c9b10SDr. David Alan Gilbert chunks--;
2027329c9b10SDr. David Alan Gilbert }
2028329c9b10SDr. David Alan Gilbert } else {
2029329c9b10SDr. David Alan Gilbert chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2030329c9b10SDr. David Alan Gilbert
2031329c9b10SDr. David Alan Gilbert if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2032329c9b10SDr. David Alan Gilbert chunks--;
2033329c9b10SDr. David Alan Gilbert }
2034329c9b10SDr. David Alan Gilbert }
2035329c9b10SDr. David Alan Gilbert
2036733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_top(chunks + 1,
2037733252deSDr. David Alan Gilbert (chunks + 1) *
2038733252deSDr. David Alan Gilbert (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2039329c9b10SDr. David Alan Gilbert
2040329c9b10SDr. David Alan Gilbert chunk_end = ram_chunk_end(block, chunk + chunks);
2041329c9b10SDr. David Alan Gilbert
2042329c9b10SDr. David Alan Gilbert
2043329c9b10SDr. David Alan Gilbert while (test_bit(chunk, block->transit_bitmap)) {
2044329c9b10SDr. David Alan Gilbert (void)count;
2045733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2046329c9b10SDr. David Alan Gilbert sge.addr, length, rdma->nb_sent, block->nb_chunks);
2047329c9b10SDr. David Alan Gilbert
2048329c9b10SDr. David Alan Gilbert ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2049329c9b10SDr. David Alan Gilbert
2050329c9b10SDr. David Alan Gilbert if (ret < 0) {
2051557c34caSMarkus Armbruster error_setg(errp, "Failed to Wait for previous write to complete "
2052329c9b10SDr. David Alan Gilbert "block %d chunk %" PRIu64
2053733252deSDr. David Alan Gilbert " current %" PRIu64 " len %" PRIu64 " %d",
2054329c9b10SDr. David Alan Gilbert current_index, chunk, sge.addr, length, rdma->nb_sent);
2055ec486974SMarkus Armbruster return -1;
2056329c9b10SDr. David Alan Gilbert }
2057329c9b10SDr. David Alan Gilbert }
2058329c9b10SDr. David Alan Gilbert
2059329c9b10SDr. David Alan Gilbert if (!rdma->pin_all || !block->is_ram_block) {
2060329c9b10SDr. David Alan Gilbert if (!block->remote_keys[chunk]) {
2061329c9b10SDr. David Alan Gilbert /*
2062329c9b10SDr. David Alan Gilbert * This chunk has not yet been registered, so first check to see
2063329c9b10SDr. David Alan Gilbert  * if the entire chunk is zero. If so, tell the other side to
2064329c9b10SDr. David Alan Gilbert * memset() + madvise() the entire chunk without RDMA.
2065329c9b10SDr. David Alan Gilbert */
2066329c9b10SDr. David Alan Gilbert
2067a1febc49SRichard Henderson if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2068329c9b10SDr. David Alan Gilbert RDMACompress comp = {
2069329c9b10SDr. David Alan Gilbert .offset = current_addr,
2070329c9b10SDr. David Alan Gilbert .value = 0,
2071329c9b10SDr. David Alan Gilbert .block_idx = current_index,
2072329c9b10SDr. David Alan Gilbert .length = length,
2073329c9b10SDr. David Alan Gilbert };
2074329c9b10SDr. David Alan Gilbert
2075329c9b10SDr. David Alan Gilbert head.len = sizeof(comp);
2076329c9b10SDr. David Alan Gilbert head.type = RDMA_CONTROL_COMPRESS;
2077329c9b10SDr. David Alan Gilbert
2078733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_zero(chunk, sge.length,
2079733252deSDr. David Alan Gilbert current_index, current_addr);
2080329c9b10SDr. David Alan Gilbert
2081b12f7777SDr. David Alan Gilbert compress_to_network(rdma, &comp);
2082329c9b10SDr. David Alan Gilbert ret = qemu_rdma_exchange_send(rdma, &head,
2083557c34caSMarkus Armbruster (uint8_t *) &comp, NULL, NULL, NULL, errp);
2084329c9b10SDr. David Alan Gilbert
2085329c9b10SDr. David Alan Gilbert if (ret < 0) {
20868c6513f7SMarkus Armbruster return -1;
2087329c9b10SDr. David Alan Gilbert }
2088329c9b10SDr. David Alan Gilbert
208967c31c9cSJuan Quintela /*
209067c31c9cSJuan Quintela * TODO: Here we are sending something, but we are not
209167c31c9cSJuan Quintela * accounting for anything transferred. The following is wrong:
209267c31c9cSJuan Quintela *
209367c31c9cSJuan Quintela * stat64_add(&mig_stats.rdma_bytes, sge.length);
209467c31c9cSJuan Quintela *
209567c31c9cSJuan Quintela  * because we are using some kind of compression. I
209667c31c9cSJuan Quintela  * would think that head.len would be closer to a
209767c31c9cSJuan Quintela  * correct value.
209867c31c9cSJuan Quintela */
2099c61d2faaSJuan Quintela stat64_add(&mig_stats.zero_pages,
2100c61d2faaSJuan Quintela sge.length / qemu_target_page_size());
2101329c9b10SDr. David Alan Gilbert return 1;
2102329c9b10SDr. David Alan Gilbert }
2103329c9b10SDr. David Alan Gilbert
2104329c9b10SDr. David Alan Gilbert /*
2105329c9b10SDr. David Alan Gilbert  * Otherwise, tell the other side to register.
2106329c9b10SDr. David Alan Gilbert */
2107329c9b10SDr. David Alan Gilbert reg.current_index = current_index;
2108329c9b10SDr. David Alan Gilbert if (block->is_ram_block) {
2109329c9b10SDr. David Alan Gilbert reg.key.current_addr = current_addr;
2110329c9b10SDr. David Alan Gilbert } else {
2111329c9b10SDr. David Alan Gilbert reg.key.chunk = chunk;
2112329c9b10SDr. David Alan Gilbert }
2113329c9b10SDr. David Alan Gilbert reg.chunks = chunks;
2114329c9b10SDr. David Alan Gilbert
2115733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2116733252deSDr. David Alan Gilbert current_addr);
2117329c9b10SDr. David Alan Gilbert
2118b12f7777SDr. David Alan Gilbert             register_to_network(rdma, &reg);
2119329c9b10SDr. David Alan Gilbert             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
2120557c34caSMarkus Armbruster                                           &resp, &reg_result_idx, NULL, errp);
2121329c9b10SDr. David Alan Gilbert if (ret < 0) {
2122ec486974SMarkus Armbruster return -1;
2123329c9b10SDr. David Alan Gilbert }
2124329c9b10SDr. David Alan Gilbert
2125329c9b10SDr. David Alan Gilbert /* try to overlap this single registration with the one we sent. */
21263ac040c0SStefan Weil if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2127329c9b10SDr. David Alan Gilbert &sge.lkey, NULL, chunk,
2128329c9b10SDr. David Alan Gilbert chunk_start, chunk_end)) {
2129557c34caSMarkus Armbruster error_setg(errp, "cannot get lkey");
21308c6513f7SMarkus Armbruster return -1;
2131329c9b10SDr. David Alan Gilbert }
2132329c9b10SDr. David Alan Gilbert
2133329c9b10SDr. David Alan Gilbert reg_result = (RDMARegisterResult *)
2134329c9b10SDr. David Alan Gilbert rdma->wr_data[reg_result_idx].control_curr;
2135329c9b10SDr. David Alan Gilbert
2136329c9b10SDr. David Alan Gilbert network_to_result(reg_result);
2137329c9b10SDr. David Alan Gilbert
2138733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2139733252deSDr. David Alan Gilbert reg_result->rkey, chunk);
2140329c9b10SDr. David Alan Gilbert
2141329c9b10SDr. David Alan Gilbert block->remote_keys[chunk] = reg_result->rkey;
2142329c9b10SDr. David Alan Gilbert block->remote_host_addr = reg_result->host_addr;
2143329c9b10SDr. David Alan Gilbert } else {
2144329c9b10SDr. David Alan Gilbert /* already registered before */
21453ac040c0SStefan Weil if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2146329c9b10SDr. David Alan Gilbert &sge.lkey, NULL, chunk,
2147329c9b10SDr. David Alan Gilbert chunk_start, chunk_end)) {
2148557c34caSMarkus Armbruster error_setg(errp, "cannot get lkey!");
21498c6513f7SMarkus Armbruster return -1;
2150329c9b10SDr. David Alan Gilbert }
2151329c9b10SDr. David Alan Gilbert }
2152329c9b10SDr. David Alan Gilbert
2153329c9b10SDr. David Alan Gilbert send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2154329c9b10SDr. David Alan Gilbert } else {
2155329c9b10SDr. David Alan Gilbert send_wr.wr.rdma.rkey = block->remote_rkey;
2156329c9b10SDr. David Alan Gilbert
21573ac040c0SStefan Weil if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2158329c9b10SDr. David Alan Gilbert &sge.lkey, NULL, chunk,
2159329c9b10SDr. David Alan Gilbert chunk_start, chunk_end)) {
2160557c34caSMarkus Armbruster error_setg(errp, "cannot get lkey!");
21618c6513f7SMarkus Armbruster return -1;
2162329c9b10SDr. David Alan Gilbert }
2163329c9b10SDr. David Alan Gilbert }
2164329c9b10SDr. David Alan Gilbert
2165329c9b10SDr. David Alan Gilbert /*
2166329c9b10SDr. David Alan Gilbert * Encode the ram block index and chunk within this wrid.
2167329c9b10SDr. David Alan Gilbert * We will use this information at the time of completion
2168329c9b10SDr. David Alan Gilbert * to figure out which bitmap to check against and then which
2169329c9b10SDr. David Alan Gilbert * chunk in the bitmap to look for.
2170329c9b10SDr. David Alan Gilbert */
2171329c9b10SDr. David Alan Gilbert send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2172329c9b10SDr. David Alan Gilbert current_index, chunk);
2173329c9b10SDr. David Alan Gilbert
2174329c9b10SDr. David Alan Gilbert send_wr.opcode = IBV_WR_RDMA_WRITE;
2175329c9b10SDr. David Alan Gilbert send_wr.send_flags = IBV_SEND_SIGNALED;
2176329c9b10SDr. David Alan Gilbert send_wr.sg_list = &sge;
2177329c9b10SDr. David Alan Gilbert send_wr.num_sge = 1;
2178329c9b10SDr. David Alan Gilbert send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2179329c9b10SDr. David Alan Gilbert (current_addr - block->offset);
2180329c9b10SDr. David Alan Gilbert
2181733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2182329c9b10SDr. David Alan Gilbert sge.length);
2183329c9b10SDr. David Alan Gilbert
2184329c9b10SDr. David Alan Gilbert /*
2185329c9b10SDr. David Alan Gilbert * ibv_post_send() does not return negative error numbers,
2186329c9b10SDr. David Alan Gilbert * per the specification they are positive - no idea why.
2187329c9b10SDr. David Alan Gilbert */
2188329c9b10SDr. David Alan Gilbert ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2189329c9b10SDr. David Alan Gilbert
2190329c9b10SDr. David Alan Gilbert if (ret == ENOMEM) {
2191733252deSDr. David Alan Gilbert trace_qemu_rdma_write_one_queue_full();
2192329c9b10SDr. David Alan Gilbert ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2193329c9b10SDr. David Alan Gilbert if (ret < 0) {
2194557c34caSMarkus Armbruster error_setg(errp, "rdma migration: failed to make "
21951b6e1da6SMarkus Armbruster "room in full send queue!");
2196ec486974SMarkus Armbruster return -1;
2197329c9b10SDr. David Alan Gilbert }
2198329c9b10SDr. David Alan Gilbert
2199329c9b10SDr. David Alan Gilbert goto retry;
2200329c9b10SDr. David Alan Gilbert
2201329c9b10SDr. David Alan Gilbert } else if (ret > 0) {
2202557c34caSMarkus Armbruster error_setg_errno(errp, ret,
2203557c34caSMarkus Armbruster "rdma migration: post rdma write failed");
22048c6513f7SMarkus Armbruster return -1;
2205329c9b10SDr. David Alan Gilbert }
2206329c9b10SDr. David Alan Gilbert
2207329c9b10SDr. David Alan Gilbert set_bit(chunk, block->transit_bitmap);
22085690756dSJuan Quintela stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
220967c31c9cSJuan Quintela /*
221067c31c9cSJuan Quintela      * We are adding the amount of data written to transferred, but no
2211e3fc6934SMichael Tokarev      * overhead at all. I will assume that RDMA is magical and doesn't
221267c31c9cSJuan Quintela      * need to transfer (at least) the addresses where it wants to
221367c31c9cSJuan Quintela      * write the pages. Here it looks like it should be something
221467c31c9cSJuan Quintela * like:
221567c31c9cSJuan Quintela * sizeof(send_wr) + sge.length
221667c31c9cSJuan Quintela * but this being RDMA, who knows.
221767c31c9cSJuan Quintela */
221867c31c9cSJuan Quintela stat64_add(&mig_stats.rdma_bytes, sge.length);
22195690756dSJuan Quintela ram_transferred_add(sge.length);
2220329c9b10SDr. David Alan Gilbert rdma->total_writes++;
2221329c9b10SDr. David Alan Gilbert
2222329c9b10SDr. David Alan Gilbert return 0;
2223329c9b10SDr. David Alan Gilbert }
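
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): the chunk accounting used above, pulled out
 * as a worked example.  Assuming the write starts on a chunk boundary and
 * RDMA_REG_CHUNK_SHIFT == 20 (1 MB chunks), a 2 MB write spans 2 chunks
 * and a 2 MB + 4 KB write spans 3, counting the starting chunk itself.
 */
static uint64_t example_chunks_spanned(uint64_t length)
{
    uint64_t extra = length / (1UL << RDMA_REG_CHUNK_SHIFT);

    if (extra && (length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0) {
        extra--;
    }
    /* qemu_rdma_write_one() registers chunks 'chunk' .. 'chunk + extra' */
    return extra + 1;
}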
2224329c9b10SDr. David Alan Gilbert
2225329c9b10SDr. David Alan Gilbert /*
2226329c9b10SDr. David Alan Gilbert * Push out any unwritten RDMA operations.
2227329c9b10SDr. David Alan Gilbert *
2228329c9b10SDr. David Alan Gilbert * We support sending out multiple chunks at the same time.
2229329c9b10SDr. David Alan Gilbert * Not all of them need to get signaled in the completion queue.
2230329c9b10SDr. David Alan Gilbert */
223156095477SMarkus Armbruster static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp)
2232329c9b10SDr. David Alan Gilbert {
2233329c9b10SDr. David Alan Gilbert int ret;
2234329c9b10SDr. David Alan Gilbert
2235329c9b10SDr. David Alan Gilbert if (!rdma->current_length) {
2236329c9b10SDr. David Alan Gilbert return 0;
2237329c9b10SDr. David Alan Gilbert }
2238329c9b10SDr. David Alan Gilbert
2239557c34caSMarkus Armbruster ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr,
2240557c34caSMarkus Armbruster rdma->current_length, errp);
2241329c9b10SDr. David Alan Gilbert
2242329c9b10SDr. David Alan Gilbert if (ret < 0) {
2243ec486974SMarkus Armbruster return -1;
2244329c9b10SDr. David Alan Gilbert }
2245329c9b10SDr. David Alan Gilbert
2246329c9b10SDr. David Alan Gilbert if (ret == 0) {
2247329c9b10SDr. David Alan Gilbert rdma->nb_sent++;
2248733252deSDr. David Alan Gilbert trace_qemu_rdma_write_flush(rdma->nb_sent);
2249329c9b10SDr. David Alan Gilbert }
2250329c9b10SDr. David Alan Gilbert
2251329c9b10SDr. David Alan Gilbert rdma->current_length = 0;
2252329c9b10SDr. David Alan Gilbert rdma->current_addr = 0;
2253329c9b10SDr. David Alan Gilbert
2254329c9b10SDr. David Alan Gilbert return 0;
2255329c9b10SDr. David Alan Gilbert }
2256329c9b10SDr. David Alan Gilbert
22576a3792d7SMarkus Armbruster static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
2258329c9b10SDr. David Alan Gilbert uint64_t offset, uint64_t len)
2259329c9b10SDr. David Alan Gilbert {
2260329c9b10SDr. David Alan Gilbert RDMALocalBlock *block;
2261329c9b10SDr. David Alan Gilbert uint8_t *host_addr;
2262329c9b10SDr. David Alan Gilbert uint8_t *chunk_end;
2263329c9b10SDr. David Alan Gilbert
2264329c9b10SDr. David Alan Gilbert if (rdma->current_index < 0) {
22656a3792d7SMarkus Armbruster return false;
2266329c9b10SDr. David Alan Gilbert }
2267329c9b10SDr. David Alan Gilbert
2268329c9b10SDr. David Alan Gilbert if (rdma->current_chunk < 0) {
22696a3792d7SMarkus Armbruster return false;
2270329c9b10SDr. David Alan Gilbert }
2271329c9b10SDr. David Alan Gilbert
2272329c9b10SDr. David Alan Gilbert block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2273329c9b10SDr. David Alan Gilbert host_addr = block->local_host_addr + (offset - block->offset);
2274329c9b10SDr. David Alan Gilbert chunk_end = ram_chunk_end(block, rdma->current_chunk);
2275329c9b10SDr. David Alan Gilbert
2276329c9b10SDr. David Alan Gilbert if (rdma->current_length == 0) {
22776a3792d7SMarkus Armbruster return false;
2278329c9b10SDr. David Alan Gilbert }
2279329c9b10SDr. David Alan Gilbert
2280329c9b10SDr. David Alan Gilbert /*
2281329c9b10SDr. David Alan Gilbert * Only merge into chunk sequentially.
2282329c9b10SDr. David Alan Gilbert */
2283329c9b10SDr. David Alan Gilbert if (offset != (rdma->current_addr + rdma->current_length)) {
22846a3792d7SMarkus Armbruster return false;
2285329c9b10SDr. David Alan Gilbert }
2286329c9b10SDr. David Alan Gilbert
2287329c9b10SDr. David Alan Gilbert if (offset < block->offset) {
22886a3792d7SMarkus Armbruster return false;
2289329c9b10SDr. David Alan Gilbert }
2290329c9b10SDr. David Alan Gilbert
2291329c9b10SDr. David Alan Gilbert if ((offset + len) > (block->offset + block->length)) {
22926a3792d7SMarkus Armbruster return false;
2293329c9b10SDr. David Alan Gilbert }
2294329c9b10SDr. David Alan Gilbert
2295329c9b10SDr. David Alan Gilbert if ((host_addr + len) > chunk_end) {
22966a3792d7SMarkus Armbruster return false;
2297329c9b10SDr. David Alan Gilbert }
2298329c9b10SDr. David Alan Gilbert
22996a3792d7SMarkus Armbruster return true;
2300329c9b10SDr. David Alan Gilbert }
2301329c9b10SDr. David Alan Gilbert
2302329c9b10SDr. David Alan Gilbert /*
2303329c9b10SDr. David Alan Gilbert * We're not actually writing here, but doing three things:
2304329c9b10SDr. David Alan Gilbert *
2305329c9b10SDr. David Alan Gilbert * 1. Identify the chunk the buffer belongs to.
2306329c9b10SDr. David Alan Gilbert * 2. If the chunk is full or the buffer doesn't belong to the current
2307329c9b10SDr. David Alan Gilbert * chunk, then start a new chunk and flush() the old chunk.
2308329c9b10SDr. David Alan Gilbert * 3. To keep the hardware busy, we also group chunks into batches
2309329c9b10SDr. David Alan Gilbert * and only require that a batch gets acknowledged in the completion
23103a4452d8Szhaolichang * queue instead of each individual chunk.
2311329c9b10SDr. David Alan Gilbert */
2312e3378035SJuan Quintela static int qemu_rdma_write(RDMAContext *rdma,
2313329c9b10SDr. David Alan Gilbert uint64_t block_offset, uint64_t offset,
2314446e559cSMarkus Armbruster uint64_t len, Error **errp)
2315329c9b10SDr. David Alan Gilbert {
2316329c9b10SDr. David Alan Gilbert uint64_t current_addr = block_offset + offset;
2317329c9b10SDr. David Alan Gilbert uint64_t index = rdma->current_index;
2318329c9b10SDr. David Alan Gilbert uint64_t chunk = rdma->current_chunk;
2319329c9b10SDr. David Alan Gilbert
2320329c9b10SDr. David Alan Gilbert /* If we cannot merge it, we flush the current buffer first. */
23216a3792d7SMarkus Armbruster if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
23228f5a7faaSJuan Quintela if (qemu_rdma_write_flush(rdma, errp) < 0) {
2323ec486974SMarkus Armbruster return -1;
2324329c9b10SDr. David Alan Gilbert }
2325329c9b10SDr. David Alan Gilbert rdma->current_length = 0;
2326329c9b10SDr. David Alan Gilbert rdma->current_addr = current_addr;
2327329c9b10SDr. David Alan Gilbert
232887e6bdabSMarkus Armbruster qemu_rdma_search_ram_block(rdma, block_offset,
2329329c9b10SDr. David Alan Gilbert offset, len, &index, &chunk);
2330329c9b10SDr. David Alan Gilbert rdma->current_index = index;
2331329c9b10SDr. David Alan Gilbert rdma->current_chunk = chunk;
2332329c9b10SDr. David Alan Gilbert }
2333329c9b10SDr. David Alan Gilbert
2334329c9b10SDr. David Alan Gilbert /* merge it */
2335329c9b10SDr. David Alan Gilbert rdma->current_length += len;
2336329c9b10SDr. David Alan Gilbert
2337329c9b10SDr. David Alan Gilbert /* flush it if buffer is too large */
2338329c9b10SDr. David Alan Gilbert if (rdma->current_length >= RDMA_MERGE_MAX) {
2339446e559cSMarkus Armbruster return qemu_rdma_write_flush(rdma, errp);
2340329c9b10SDr. David Alan Gilbert }
2341329c9b10SDr. David Alan Gilbert
2342329c9b10SDr. David Alan Gilbert return 0;
2343329c9b10SDr. David Alan Gilbert }
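
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): what the merging above does for a run of
 * sequential pages.  The 4 KB page size and the page count are
 * hypothetical; the point is that consecutive writes inside one chunk keep
 * growing rdma->current_length, and data only hits the wire when the
 * buffer reaches RDMA_MERGE_MAX or a non-mergeable write (gap, new block,
 * chunk boundary) forces a flush.
 */
static int example_write_sequential_pages(RDMAContext *rdma,
                                          uint64_t block_offset,
                                          Error **errp)
{
    const uint64_t page = 4096;

    for (uint64_t i = 0; i < 1024; i++) {          /* 1024 * 4 KB = 4 MB */
        if (qemu_rdma_write(rdma, block_offset, i * page, page, errp) < 0) {
            return -1;
        }
    }
    /* Push out whatever is still buffered below RDMA_MERGE_MAX. */
    return qemu_rdma_write_flush(rdma, errp);
}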
2344329c9b10SDr. David Alan Gilbert
2345329c9b10SDr. David Alan Gilbert static void qemu_rdma_cleanup(RDMAContext *rdma)
2346329c9b10SDr. David Alan Gilbert {
2347f3805964SMarkus Armbruster Error *err = NULL;
2348329c9b10SDr. David Alan Gilbert
2349329c9b10SDr. David Alan Gilbert if (rdma->cm_id && rdma->connected) {
2350b86c94a4SMarkus Armbruster if ((rdma->errored ||
235132bce196SDr. David Alan Gilbert migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
235232bce196SDr. David Alan Gilbert !rdma->received_error) {
2353329c9b10SDr. David Alan Gilbert RDMAControlHeader head = { .len = 0,
2354329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_ERROR,
2355329c9b10SDr. David Alan Gilbert .repeat = 1,
2356329c9b10SDr. David Alan Gilbert };
23575cec563dSMarkus Armbruster warn_report("Early error. Sending error.");
2358f3805964SMarkus Armbruster if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) {
23595cec563dSMarkus Armbruster warn_report_err(err);
2360f3805964SMarkus Armbruster }
2361329c9b10SDr. David Alan Gilbert }
2362329c9b10SDr. David Alan Gilbert
2363c5e76115SLidong Chen rdma_disconnect(rdma->cm_id);
2364733252deSDr. David Alan Gilbert trace_qemu_rdma_cleanup_disconnect();
2365329c9b10SDr. David Alan Gilbert rdma->connected = false;
2366329c9b10SDr. David Alan Gilbert }
2367329c9b10SDr. David Alan Gilbert
2368cf75e268SDr. David Alan Gilbert if (rdma->channel) {
2369fbbaacabSDr. David Alan Gilbert qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2370cf75e268SDr. David Alan Gilbert }
2371a97270adSDr. David Alan Gilbert g_free(rdma->dest_blocks);
2372a97270adSDr. David Alan Gilbert rdma->dest_blocks = NULL;
2373329c9b10SDr. David Alan Gilbert
2374ebdb85f9SJuan Quintela for (int i = 0; i < RDMA_WRID_MAX; i++) {
2375ebdb85f9SJuan Quintela if (rdma->wr_data[i].control_mr) {
2376329c9b10SDr. David Alan Gilbert rdma->total_registrations--;
2377ebdb85f9SJuan Quintela ibv_dereg_mr(rdma->wr_data[i].control_mr);
2378329c9b10SDr. David Alan Gilbert }
2379ebdb85f9SJuan Quintela rdma->wr_data[i].control_mr = NULL;
2380329c9b10SDr. David Alan Gilbert }
2381329c9b10SDr. David Alan Gilbert
2382329c9b10SDr. David Alan Gilbert if (rdma->local_ram_blocks.block) {
2383329c9b10SDr. David Alan Gilbert while (rdma->local_ram_blocks.nb_blocks) {
238403fcab38SDr. David Alan Gilbert rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2385329c9b10SDr. David Alan Gilbert }
2386329c9b10SDr. David Alan Gilbert }
2387329c9b10SDr. David Alan Gilbert
238880b262e1SPadmanabh Ratnakar if (rdma->qp) {
238980b262e1SPadmanabh Ratnakar rdma_destroy_qp(rdma->cm_id);
239080b262e1SPadmanabh Ratnakar rdma->qp = NULL;
239180b262e1SPadmanabh Ratnakar }
2392b390afd8SLi Zhijian if (rdma->recv_cq) {
2393b390afd8SLi Zhijian ibv_destroy_cq(rdma->recv_cq);
2394b390afd8SLi Zhijian rdma->recv_cq = NULL;
2395329c9b10SDr. David Alan Gilbert }
2396b390afd8SLi Zhijian if (rdma->send_cq) {
2397b390afd8SLi Zhijian ibv_destroy_cq(rdma->send_cq);
2398b390afd8SLi Zhijian rdma->send_cq = NULL;
2399b390afd8SLi Zhijian }
2400b390afd8SLi Zhijian if (rdma->recv_comp_channel) {
2401b390afd8SLi Zhijian ibv_destroy_comp_channel(rdma->recv_comp_channel);
2402b390afd8SLi Zhijian rdma->recv_comp_channel = NULL;
2403b390afd8SLi Zhijian }
2404b390afd8SLi Zhijian if (rdma->send_comp_channel) {
2405b390afd8SLi Zhijian ibv_destroy_comp_channel(rdma->send_comp_channel);
2406b390afd8SLi Zhijian rdma->send_comp_channel = NULL;
2407329c9b10SDr. David Alan Gilbert }
2408329c9b10SDr. David Alan Gilbert if (rdma->pd) {
2409329c9b10SDr. David Alan Gilbert ibv_dealloc_pd(rdma->pd);
2410329c9b10SDr. David Alan Gilbert rdma->pd = NULL;
2411329c9b10SDr. David Alan Gilbert }
241280b262e1SPadmanabh Ratnakar if (rdma->cm_id) {
241380b262e1SPadmanabh Ratnakar rdma_destroy_id(rdma->cm_id);
241480b262e1SPadmanabh Ratnakar rdma->cm_id = NULL;
241580b262e1SPadmanabh Ratnakar }
241655cc1b59SLidong Chen
241755cc1b59SLidong Chen     /* on the destination side, listen_id and channel are shared */
2418329c9b10SDr. David Alan Gilbert if (rdma->listen_id) {
241955cc1b59SLidong Chen if (!rdma->is_return_path) {
2420329c9b10SDr. David Alan Gilbert rdma_destroy_id(rdma->listen_id);
2421329c9b10SDr. David Alan Gilbert }
242255cc1b59SLidong Chen rdma->listen_id = NULL;
242355cc1b59SLidong Chen
242455cc1b59SLidong Chen if (rdma->channel) {
242555cc1b59SLidong Chen if (!rdma->is_return_path) {
242655cc1b59SLidong Chen rdma_destroy_event_channel(rdma->channel);
242755cc1b59SLidong Chen }
242855cc1b59SLidong Chen rdma->channel = NULL;
242955cc1b59SLidong Chen }
243055cc1b59SLidong Chen }
243155cc1b59SLidong Chen
2432329c9b10SDr. David Alan Gilbert if (rdma->channel) {
2433329c9b10SDr. David Alan Gilbert rdma_destroy_event_channel(rdma->channel);
2434329c9b10SDr. David Alan Gilbert rdma->channel = NULL;
2435329c9b10SDr. David Alan Gilbert }
2436329c9b10SDr. David Alan Gilbert g_free(rdma->host);
2437329c9b10SDr. David Alan Gilbert rdma->host = NULL;
2438329c9b10SDr. David Alan Gilbert }
2439329c9b10SDr. David Alan Gilbert
2440329c9b10SDr. David Alan Gilbert
2441bbfb89e3SFam Zheng static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2442329c9b10SDr. David Alan Gilbert {
2443ebdb85f9SJuan Quintela int ret;
2444329c9b10SDr. David Alan Gilbert
2445329c9b10SDr. David Alan Gilbert /*
2446329c9b10SDr. David Alan Gilbert * Will be validated against destination's actual capabilities
2447329c9b10SDr. David Alan Gilbert * after the connect() completes.
2448329c9b10SDr. David Alan Gilbert */
2449329c9b10SDr. David Alan Gilbert rdma->pin_all = pin_all;
2450329c9b10SDr. David Alan Gilbert
2451b16defbbSMarkus Armbruster ret = qemu_rdma_resolve_host(rdma, errp);
24524a102179SMarkus Armbruster if (ret < 0) {
2453329c9b10SDr. David Alan Gilbert goto err_rdma_source_init;
2454329c9b10SDr. David Alan Gilbert }
2455329c9b10SDr. David Alan Gilbert
245607d5b946SMarkus Armbruster ret = qemu_rdma_alloc_pd_cq(rdma, errp);
24574a102179SMarkus Armbruster if (ret < 0) {
2458329c9b10SDr. David Alan Gilbert goto err_rdma_source_init;
2459329c9b10SDr. David Alan Gilbert }
2460329c9b10SDr. David Alan Gilbert
2461329c9b10SDr. David Alan Gilbert ret = qemu_rdma_alloc_qp(rdma);
24624a102179SMarkus Armbruster if (ret < 0) {
24638fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
2464329c9b10SDr. David Alan Gilbert goto err_rdma_source_init;
2465329c9b10SDr. David Alan Gilbert }
2466329c9b10SDr. David Alan Gilbert
24670610d7a1SMarkus Armbruster qemu_rdma_init_ram_blocks(rdma);
2468329c9b10SDr. David Alan Gilbert
2469760ff4beSDr. David Alan Gilbert /* Build the hash that maps from offset to RAMBlock */
2470760ff4beSDr. David Alan Gilbert rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2471ebdb85f9SJuan Quintela for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
2472760ff4beSDr. David Alan Gilbert g_hash_table_insert(rdma->blockmap,
2473ebdb85f9SJuan Quintela (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
2474ebdb85f9SJuan Quintela &rdma->local_ram_blocks.block[i]);
2475760ff4beSDr. David Alan Gilbert }
2476760ff4beSDr. David Alan Gilbert
2477ebdb85f9SJuan Quintela for (int i = 0; i < RDMA_WRID_MAX; i++) {
2478ebdb85f9SJuan Quintela ret = qemu_rdma_reg_control(rdma, i);
24794a102179SMarkus Armbruster if (ret < 0) {
2480ebdb85f9SJuan Quintela error_setg(errp, "RDMA ERROR: rdma migration: error "
2481ebdb85f9SJuan Quintela "registering %d control!", i);
2482329c9b10SDr. David Alan Gilbert goto err_rdma_source_init;
2483329c9b10SDr. David Alan Gilbert }
2484329c9b10SDr. David Alan Gilbert }
2485329c9b10SDr. David Alan Gilbert
2486329c9b10SDr. David Alan Gilbert return 0;
2487329c9b10SDr. David Alan Gilbert
2488329c9b10SDr. David Alan Gilbert err_rdma_source_init:
2489329c9b10SDr. David Alan Gilbert qemu_rdma_cleanup(rdma);
2490329c9b10SDr. David Alan Gilbert return -1;
2491329c9b10SDr. David Alan Gilbert }
2492329c9b10SDr. David Alan Gilbert
2493e49e49ddSLi Zhijian static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2494e49e49ddSLi Zhijian struct rdma_cm_event **cm_event,
2495e49e49ddSLi Zhijian long msec, Error **errp)
2496e49e49ddSLi Zhijian {
2497e49e49ddSLi Zhijian int ret;
2498e49e49ddSLi Zhijian struct pollfd poll_fd = {
2499e49e49ddSLi Zhijian .fd = rdma->channel->fd,
2500e49e49ddSLi Zhijian .events = POLLIN,
2501e49e49ddSLi Zhijian .revents = 0
2502e49e49ddSLi Zhijian };
2503e49e49ddSLi Zhijian
2504e49e49ddSLi Zhijian do {
2505e49e49ddSLi Zhijian ret = poll(&poll_fd, 1, msec);
2506e49e49ddSLi Zhijian } while (ret < 0 && errno == EINTR);
2507e49e49ddSLi Zhijian
2508e49e49ddSLi Zhijian if (ret == 0) {
25098fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: poll cm event timeout");
2510e49e49ddSLi Zhijian return -1;
2511e49e49ddSLi Zhijian } else if (ret < 0) {
25128fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
25138fd471bdSMarkus Armbruster errno);
2514e49e49ddSLi Zhijian return -1;
2515e49e49ddSLi Zhijian } else if (poll_fd.revents & POLLIN) {
2516f35c0d9bSMarkus Armbruster if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
25178fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: failed to get cm event");
2518f35c0d9bSMarkus Armbruster return -1;
2519f35c0d9bSMarkus Armbruster }
2520f35c0d9bSMarkus Armbruster return 0;
2521e49e49ddSLi Zhijian } else {
25228fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
25238fd471bdSMarkus Armbruster poll_fd.revents);
2524e49e49ddSLi Zhijian return -1;
2525e49e49ddSLi Zhijian }
2526e49e49ddSLi Zhijian }
2527e49e49ddSLi Zhijian
25283c03f21cSMarkus Armbruster static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
25293c03f21cSMarkus Armbruster Error **errp)
2530329c9b10SDr. David Alan Gilbert {
2531329c9b10SDr. David Alan Gilbert RDMACapabilities cap = {
2532329c9b10SDr. David Alan Gilbert .version = RDMA_CONTROL_VERSION_CURRENT,
2533329c9b10SDr. David Alan Gilbert .flags = 0,
2534329c9b10SDr. David Alan Gilbert };
2535329c9b10SDr. David Alan Gilbert struct rdma_conn_param conn_param = { .initiator_depth = 2,
2536329c9b10SDr. David Alan Gilbert .retry_count = 5,
2537329c9b10SDr. David Alan Gilbert .private_data = &cap,
2538329c9b10SDr. David Alan Gilbert .private_data_len = sizeof(cap),
2539329c9b10SDr. David Alan Gilbert };
2540329c9b10SDr. David Alan Gilbert struct rdma_cm_event *cm_event;
2541329c9b10SDr. David Alan Gilbert int ret;
2542329c9b10SDr. David Alan Gilbert
2543329c9b10SDr. David Alan Gilbert /*
2544329c9b10SDr. David Alan Gilbert      * Only negotiate the capability with the destination if the user
2545329c9b10SDr. David Alan Gilbert * on the source first requested the capability.
2546329c9b10SDr. David Alan Gilbert */
2547329c9b10SDr. David Alan Gilbert if (rdma->pin_all) {
2548733252deSDr. David Alan Gilbert trace_qemu_rdma_connect_pin_all_requested();
2549329c9b10SDr. David Alan Gilbert cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2550329c9b10SDr. David Alan Gilbert }
2551329c9b10SDr. David Alan Gilbert
2552329c9b10SDr. David Alan Gilbert caps_to_network(&cap);
2553329c9b10SDr. David Alan Gilbert
25543c0c3ebaSMarkus Armbruster ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
25554a102179SMarkus Armbruster if (ret < 0) {
25569cf2bab2SDr. David Alan Gilbert goto err_rdma_source_connect;
25579cf2bab2SDr. David Alan Gilbert }
25589cf2bab2SDr. David Alan Gilbert
2559329c9b10SDr. David Alan Gilbert ret = rdma_connect(rdma->cm_id, &conn_param);
25604a102179SMarkus Armbruster if (ret < 0) {
256135b1561eSMarkus Armbruster error_setg_errno(errp, errno,
256235b1561eSMarkus Armbruster "RDMA ERROR: connecting to destination!");
2563329c9b10SDr. David Alan Gilbert goto err_rdma_source_connect;
2564329c9b10SDr. David Alan Gilbert }
2565329c9b10SDr. David Alan Gilbert
2566e49e49ddSLi Zhijian if (return_path) {
2567e49e49ddSLi Zhijian ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2568e49e49ddSLi Zhijian } else {
2569329c9b10SDr. David Alan Gilbert ret = rdma_get_cm_event(rdma->channel, &cm_event);
2570f35c0d9bSMarkus Armbruster if (ret < 0) {
257135b1561eSMarkus Armbruster error_setg_errno(errp, errno,
257235b1561eSMarkus Armbruster "RDMA ERROR: failed to get cm event");
25738fd471bdSMarkus Armbruster }
2574f35c0d9bSMarkus Armbruster }
25754a102179SMarkus Armbruster if (ret < 0) {
2576329c9b10SDr. David Alan Gilbert goto err_rdma_source_connect;
2577329c9b10SDr. David Alan Gilbert }
2578329c9b10SDr. David Alan Gilbert
2579329c9b10SDr. David Alan Gilbert if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
25808fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: connecting to destination!");
2581329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
2582329c9b10SDr. David Alan Gilbert goto err_rdma_source_connect;
2583329c9b10SDr. David Alan Gilbert }
2584329c9b10SDr. David Alan Gilbert rdma->connected = true;
2585329c9b10SDr. David Alan Gilbert
2586329c9b10SDr. David Alan Gilbert memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2587329c9b10SDr. David Alan Gilbert network_to_caps(&cap);
2588329c9b10SDr. David Alan Gilbert
2589329c9b10SDr. David Alan Gilbert /*
2590329c9b10SDr. David Alan Gilbert * Verify that the *requested* capabilities are supported by the destination
2591329c9b10SDr. David Alan Gilbert * and disable them otherwise.
2592329c9b10SDr. David Alan Gilbert */
2593329c9b10SDr. David Alan Gilbert if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2594e518b005SMarkus Armbruster warn_report("RDMA: Server cannot support pinning all memory. "
2595329c9b10SDr. David Alan Gilbert "Will register memory dynamically.");
2596329c9b10SDr. David Alan Gilbert rdma->pin_all = false;
2597329c9b10SDr. David Alan Gilbert }
2598329c9b10SDr. David Alan Gilbert
2599733252deSDr. David Alan Gilbert trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2600329c9b10SDr. David Alan Gilbert
2601329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
2602329c9b10SDr. David Alan Gilbert
2603329c9b10SDr. David Alan Gilbert rdma->control_ready_expected = 1;
2604329c9b10SDr. David Alan Gilbert rdma->nb_sent = 0;
2605329c9b10SDr. David Alan Gilbert return 0;
2606329c9b10SDr. David Alan Gilbert
2607329c9b10SDr. David Alan Gilbert err_rdma_source_connect:
2608329c9b10SDr. David Alan Gilbert qemu_rdma_cleanup(rdma);
2609329c9b10SDr. David Alan Gilbert return -1;
2610329c9b10SDr. David Alan Gilbert }
2611329c9b10SDr. David Alan Gilbert
2612329c9b10SDr. David Alan Gilbert static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2613329c9b10SDr. David Alan Gilbert {
2614071d5ae4SMarkus Armbruster Error *err = NULL;
2615ebdb85f9SJuan Quintela int ret;
2616329c9b10SDr. David Alan Gilbert struct rdma_cm_id *listen_id;
2617329c9b10SDr. David Alan Gilbert char ip[40] = "unknown";
26181dbd2fd9SMichael Tokarev struct rdma_addrinfo *res, *e;
2619329c9b10SDr. David Alan Gilbert char port_str[16];
2620f736e414SJack Wang int reuse = 1;
2621329c9b10SDr. David Alan Gilbert
2622ebdb85f9SJuan Quintela for (int i = 0; i < RDMA_WRID_MAX; i++) {
2623ebdb85f9SJuan Quintela rdma->wr_data[i].control_len = 0;
2624ebdb85f9SJuan Quintela rdma->wr_data[i].control_curr = NULL;
2625329c9b10SDr. David Alan Gilbert }
2626329c9b10SDr. David Alan Gilbert
26271dbd2fd9SMichael Tokarev if (!rdma->host || !rdma->host[0]) {
26288fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: RDMA host is not set!");
2629b86c94a4SMarkus Armbruster rdma->errored = true;
2630329c9b10SDr. David Alan Gilbert return -1;
2631329c9b10SDr. David Alan Gilbert }
2632329c9b10SDr. David Alan Gilbert /* create CM channel */
2633329c9b10SDr. David Alan Gilbert rdma->channel = rdma_create_event_channel();
2634329c9b10SDr. David Alan Gilbert if (!rdma->channel) {
26358fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not create rdma event channel");
2636b86c94a4SMarkus Armbruster rdma->errored = true;
2637329c9b10SDr. David Alan Gilbert return -1;
2638329c9b10SDr. David Alan Gilbert }
2639329c9b10SDr. David Alan Gilbert
2640329c9b10SDr. David Alan Gilbert /* create CM id */
2641329c9b10SDr. David Alan Gilbert ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
26424a102179SMarkus Armbruster if (ret < 0) {
26438fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not create cm_id!");
2644329c9b10SDr. David Alan Gilbert goto err_dest_init_create_listen_id;
2645329c9b10SDr. David Alan Gilbert }
2646329c9b10SDr. David Alan Gilbert
2647329c9b10SDr. David Alan Gilbert snprintf(port_str, 16, "%d", rdma->port);
2648329c9b10SDr. David Alan Gilbert port_str[15] = '\0';
2649329c9b10SDr. David Alan Gilbert
2650329c9b10SDr. David Alan Gilbert ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
265107249822SMarkus Armbruster if (ret) {
26528fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
26538fd471bdSMarkus Armbruster rdma->host);
2654329c9b10SDr. David Alan Gilbert goto err_dest_init_bind_addr;
2655329c9b10SDr. David Alan Gilbert }
2656329c9b10SDr. David Alan Gilbert
2657f736e414SJack Wang ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
2658f736e414SJack Wang &reuse, sizeof reuse);
26594a102179SMarkus Armbruster if (ret < 0) {
26608fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
2661f736e414SJack Wang goto err_dest_init_bind_addr;
2662f736e414SJack Wang }
2663071d5ae4SMarkus Armbruster
2664071d5ae4SMarkus Armbruster /* Try all addresses, saving the first error in @err */
2665329c9b10SDr. David Alan Gilbert for (e = res; e != NULL; e = e->ai_next) {
2666071d5ae4SMarkus Armbruster Error **local_errp = err ? NULL : &err;
2667071d5ae4SMarkus Armbruster
2668329c9b10SDr. David Alan Gilbert inet_ntop(e->ai_family,
2669329c9b10SDr. David Alan Gilbert &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2670733252deSDr. David Alan Gilbert trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2671329c9b10SDr. David Alan Gilbert ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
26724a102179SMarkus Armbruster if (ret < 0) {
26731dbd2fd9SMichael Tokarev continue;
26741dbd2fd9SMichael Tokarev }
2675329c9b10SDr. David Alan Gilbert if (e->ai_family == AF_INET6) {
2676071d5ae4SMarkus Armbruster ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
2677071d5ae4SMarkus Armbruster local_errp);
26784a102179SMarkus Armbruster if (ret < 0) {
2679329c9b10SDr. David Alan Gilbert continue;
2680329c9b10SDr. David Alan Gilbert }
2681329c9b10SDr. David Alan Gilbert }
2682071d5ae4SMarkus Armbruster error_free(err);
26831dbd2fd9SMichael Tokarev break;
2684329c9b10SDr. David Alan Gilbert }
2685329c9b10SDr. David Alan Gilbert
2686f53b450aSLi Zhijian rdma_freeaddrinfo(res);
26871dbd2fd9SMichael Tokarev if (!e) {
2688071d5ae4SMarkus Armbruster if (err) {
2689071d5ae4SMarkus Armbruster error_propagate(errp, err);
2690071d5ae4SMarkus Armbruster } else {
26918fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
26928fd471bdSMarkus Armbruster }
2693329c9b10SDr. David Alan Gilbert goto err_dest_init_bind_addr;
2694329c9b10SDr. David Alan Gilbert }
2695329c9b10SDr. David Alan Gilbert
2696329c9b10SDr. David Alan Gilbert rdma->listen_id = listen_id;
2697329c9b10SDr. David Alan Gilbert qemu_rdma_dump_gid("dest_init", listen_id);
2698329c9b10SDr. David Alan Gilbert return 0;
2699329c9b10SDr. David Alan Gilbert
2700329c9b10SDr. David Alan Gilbert err_dest_init_bind_addr:
2701329c9b10SDr. David Alan Gilbert rdma_destroy_id(listen_id);
2702329c9b10SDr. David Alan Gilbert err_dest_init_create_listen_id:
2703329c9b10SDr. David Alan Gilbert rdma_destroy_event_channel(rdma->channel);
2704329c9b10SDr. David Alan Gilbert rdma->channel = NULL;
2705b86c94a4SMarkus Armbruster rdma->errored = true;
270607249822SMarkus Armbruster return -1;
2707329c9b10SDr. David Alan Gilbert
2708329c9b10SDr. David Alan Gilbert }
2709329c9b10SDr. David Alan Gilbert
271055cc1b59SLidong Chen static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
271155cc1b59SLidong Chen RDMAContext *rdma)
271255cc1b59SLidong Chen {
2713ebdb85f9SJuan Quintela for (int i = 0; i < RDMA_WRID_MAX; i++) {
2714ebdb85f9SJuan Quintela rdma_return_path->wr_data[i].control_len = 0;
2715ebdb85f9SJuan Quintela rdma_return_path->wr_data[i].control_curr = NULL;
271655cc1b59SLidong Chen }
271755cc1b59SLidong Chen
271855cc1b59SLidong Chen     /* the CM channel and CM id are shared */
271955cc1b59SLidong Chen rdma_return_path->channel = rdma->channel;
272055cc1b59SLidong Chen rdma_return_path->listen_id = rdma->listen_id;
272155cc1b59SLidong Chen
272255cc1b59SLidong Chen rdma->return_path = rdma_return_path;
272355cc1b59SLidong Chen rdma_return_path->return_path = rdma;
272455cc1b59SLidong Chen rdma_return_path->is_return_path = true;
272555cc1b59SLidong Chen }
272655cc1b59SLidong Chen
27273fa9642fSHet Gala static RDMAContext *qemu_rdma_data_init(InetSocketAddress *saddr, Error **errp)
2728329c9b10SDr. David Alan Gilbert {
2729329c9b10SDr. David Alan Gilbert RDMAContext *rdma = NULL;
2730329c9b10SDr. David Alan Gilbert
273197f3ad35SMarkus Armbruster rdma = g_new0(RDMAContext, 1);
2732329c9b10SDr. David Alan Gilbert rdma->current_index = -1;
2733329c9b10SDr. David Alan Gilbert rdma->current_chunk = -1;
2734329c9b10SDr. David Alan Gilbert
27353fa9642fSHet Gala rdma->host = g_strdup(saddr->host);
27363fa9642fSHet Gala rdma->port = atoi(saddr->port);
2737329c9b10SDr. David Alan Gilbert return rdma;
2738329c9b10SDr. David Alan Gilbert }
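
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): a hypothetical caller turning a host/port
 * pair into an RDMAContext.  The address values are made up; real callers
 * obtain the InetSocketAddress from the migration URI.
 */
static RDMAContext *example_rdma_data_init(Error **errp)
{
    InetSocketAddress addr = {
        .host = (char *)"192.0.2.1",    /* hypothetical destination */
        .port = (char *)"4444",
    };

    return qemu_rdma_data_init(&addr, errp);
}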
2739329c9b10SDr. David Alan Gilbert
2740329c9b10SDr. David Alan Gilbert /*
2741329c9b10SDr. David Alan Gilbert * QEMUFile interface to the control channel.
2742329c9b10SDr. David Alan Gilbert * SEND messages for control only.
2743329c9b10SDr. David Alan Gilbert * VM's ram is handled with regular RDMA messages.
2744329c9b10SDr. David Alan Gilbert */
27456ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
27466ddd2d76SDaniel P. Berrange const struct iovec *iov,
27476ddd2d76SDaniel P. Berrange size_t niov,
27486ddd2d76SDaniel P. Berrange int *fds,
27496ddd2d76SDaniel P. Berrange size_t nfds,
2750b88651cbSLeonardo Bras int flags,
27516ddd2d76SDaniel P. Berrange Error **errp)
2752329c9b10SDr. David Alan Gilbert {
27536ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
275474637e6fSLidong Chen RDMAContext *rdma;
2755329c9b10SDr. David Alan Gilbert int ret;
27566ddd2d76SDaniel P. Berrange ssize_t done = 0;
275714e2fcbbSJuan Quintela size_t len;
2758329c9b10SDr. David Alan Gilbert
2759987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
2760d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rioc->rdmaout);
276174637e6fSLidong Chen
276274637e6fSLidong Chen if (!rdma) {
276374ecf6acSFiona Ebner error_setg(errp, "RDMA control channel output is not set");
276474ecf6acSFiona Ebner return -1;
276574637e6fSLidong Chen }
276674637e6fSLidong Chen
2767b86c94a4SMarkus Armbruster if (rdma->errored) {
27688e262e0bSMarkus Armbruster error_setg(errp,
27698e262e0bSMarkus Armbruster "RDMA is in an error state waiting migration to abort!");
27708e262e0bSMarkus Armbruster return -1;
27718e262e0bSMarkus Armbruster }
2772329c9b10SDr. David Alan Gilbert
2773329c9b10SDr. David Alan Gilbert /*
2774329c9b10SDr. David Alan Gilbert * Push out any writes that
2775329c9b10SDr. David Alan Gilbert      * we've queued up for the VM's ram.
2776329c9b10SDr. David Alan Gilbert */
277756095477SMarkus Armbruster ret = qemu_rdma_write_flush(rdma, errp);
2778329c9b10SDr. David Alan Gilbert if (ret < 0) {
2779b86c94a4SMarkus Armbruster rdma->errored = true;
278074ecf6acSFiona Ebner return -1;
2781329c9b10SDr. David Alan Gilbert }
2782329c9b10SDr. David Alan Gilbert
278314e2fcbbSJuan Quintela for (int i = 0; i < niov; i++) {
27846ddd2d76SDaniel P. Berrange size_t remaining = iov[i].iov_len;
27856ddd2d76SDaniel P. Berrange uint8_t * data = (void *)iov[i].iov_base;
2786329c9b10SDr. David Alan Gilbert while (remaining) {
27872ada4b63SLi Zhijian RDMAControlHeader head = {};
2788329c9b10SDr. David Alan Gilbert
2789f38f6d41SLidong Chen len = MIN(remaining, RDMA_SEND_INCREMENT);
2790f38f6d41SLidong Chen remaining -= len;
2791329c9b10SDr. David Alan Gilbert
2792f38f6d41SLidong Chen head.len = len;
2793329c9b10SDr. David Alan Gilbert head.type = RDMA_CONTROL_QEMU_FILE;
2794329c9b10SDr. David Alan Gilbert
2795c4c78dceSMarkus Armbruster ret = qemu_rdma_exchange_send(rdma, &head,
2796c4c78dceSMarkus Armbruster data, NULL, NULL, NULL, errp);
2797329c9b10SDr. David Alan Gilbert
2798329c9b10SDr. David Alan Gilbert if (ret < 0) {
2799b86c94a4SMarkus Armbruster rdma->errored = true;
280074ecf6acSFiona Ebner return -1;
2801329c9b10SDr. David Alan Gilbert }
2802329c9b10SDr. David Alan Gilbert
2803f38f6d41SLidong Chen data += len;
2804f38f6d41SLidong Chen done += len;
28056ddd2d76SDaniel P. Berrange }
2806329c9b10SDr. David Alan Gilbert }
2807329c9b10SDr. David Alan Gilbert
28086ddd2d76SDaniel P. Berrange return done;
2809329c9b10SDr. David Alan Gilbert }
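
/*
 * Editor's illustrative sketch (added, not part of the original file and
 * not wired into the build): how the writev loop above fragments a large
 * buffer.  A hypothetical 100000-byte iovec is cut into
 * RDMA_SEND_INCREMENT (32768-byte) pieces: 32768 + 32768 + 32768 + 1696,
 * i.e. four RDMA_CONTROL_QEMU_FILE control messages.
 */
static size_t example_control_messages_needed(size_t bytes)
{
    return (bytes + RDMA_SEND_INCREMENT - 1) / RDMA_SEND_INCREMENT;
}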
2810329c9b10SDr. David Alan Gilbert
2811329c9b10SDr. David Alan Gilbert static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2812a202a4c0SDr. David Alan Gilbert size_t size, int idx)
2813329c9b10SDr. David Alan Gilbert {
2814329c9b10SDr. David Alan Gilbert size_t len = 0;
2815329c9b10SDr. David Alan Gilbert
2816329c9b10SDr. David Alan Gilbert if (rdma->wr_data[idx].control_len) {
2817733252deSDr. David Alan Gilbert trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2818329c9b10SDr. David Alan Gilbert
2819329c9b10SDr. David Alan Gilbert len = MIN(size, rdma->wr_data[idx].control_len);
2820329c9b10SDr. David Alan Gilbert memcpy(buf, rdma->wr_data[idx].control_curr, len);
2821329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control_curr += len;
2822329c9b10SDr. David Alan Gilbert rdma->wr_data[idx].control_len -= len;
2823329c9b10SDr. David Alan Gilbert }
2824329c9b10SDr. David Alan Gilbert
2825329c9b10SDr. David Alan Gilbert return len;
2826329c9b10SDr. David Alan Gilbert }
2827329c9b10SDr. David Alan Gilbert
2828329c9b10SDr. David Alan Gilbert /*
2829329c9b10SDr. David Alan Gilbert * QEMUFile interface to the control channel.
2830329c9b10SDr. David Alan Gilbert * RDMA links don't use bytestreams, so we have to
2831329c9b10SDr. David Alan Gilbert * return bytes to QEMUFile opportunistically.
2832329c9b10SDr. David Alan Gilbert */
28336ddd2d76SDaniel P. Berrange static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
28346ddd2d76SDaniel P. Berrange const struct iovec *iov,
28356ddd2d76SDaniel P. Berrange size_t niov,
28366ddd2d76SDaniel P. Berrange int **fds,
28376ddd2d76SDaniel P. Berrange size_t *nfds,
283884615a19Smanish.mishra int flags,
28396ddd2d76SDaniel P. Berrange Error **errp)
2840329c9b10SDr. David Alan Gilbert {
28416ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
284274637e6fSLidong Chen RDMAContext *rdma;
2843329c9b10SDr. David Alan Gilbert RDMAControlHeader head;
2844c0d77702SMarkus Armbruster int ret;
28458ff58b05SMarkus Armbruster ssize_t done = 0;
284614e2fcbbSJuan Quintela size_t len;
2847329c9b10SDr. David Alan Gilbert
2848987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
2849d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rioc->rdmain);
285074637e6fSLidong Chen
285174637e6fSLidong Chen if (!rdma) {
285274ecf6acSFiona Ebner error_setg(errp, "RDMA control channel input is not set");
285374ecf6acSFiona Ebner return -1;
285474637e6fSLidong Chen }
285574637e6fSLidong Chen
2856b86c94a4SMarkus Armbruster if (rdma->errored) {
28578e262e0bSMarkus Armbruster error_setg(errp,
28588e262e0bSMarkus Armbruster "RDMA is in an error state waiting migration to abort!");
28598e262e0bSMarkus Armbruster return -1;
28608e262e0bSMarkus Armbruster }
2861329c9b10SDr. David Alan Gilbert
286214e2fcbbSJuan Quintela for (int i = 0; i < niov; i++) {
28636ddd2d76SDaniel P. Berrange size_t want = iov[i].iov_len;
28646ddd2d76SDaniel P. Berrange uint8_t *data = (void *)iov[i].iov_base;
28656ddd2d76SDaniel P. Berrange
2866329c9b10SDr. David Alan Gilbert /*
2867329c9b10SDr. David Alan Gilbert * First, we hold on to the last SEND message we
2868329c9b10SDr. David Alan Gilbert * were given and dish out the bytes until we run
2869329c9b10SDr. David Alan Gilbert * out of bytes.
2870329c9b10SDr. David Alan Gilbert */
287125352b37SMarkus Armbruster len = qemu_rdma_fill(rdma, data, want, 0);
287225352b37SMarkus Armbruster done += len;
287325352b37SMarkus Armbruster want -= len;
28746ddd2d76SDaniel P. Berrange /* Got what we needed, so go to next iovec */
28756ddd2d76SDaniel P. Berrange if (want == 0) {
28766ddd2d76SDaniel P. Berrange continue;
2877329c9b10SDr. David Alan Gilbert }
2878329c9b10SDr. David Alan Gilbert
28796ddd2d76SDaniel P. Berrange /* If we got any data so far, then don't wait
28806ddd2d76SDaniel P. Berrange * for more, just return what we have */
28816ddd2d76SDaniel P. Berrange if (done > 0) {
28826ddd2d76SDaniel P. Berrange break;
28836ddd2d76SDaniel P. Berrange }
28846ddd2d76SDaniel P. Berrange
28856ddd2d76SDaniel P. Berrange
28866ddd2d76SDaniel P. Berrange /* We've got nothing at all, so let's wait for
28876ddd2d76SDaniel P. Berrange * more to arrive
2888329c9b10SDr. David Alan Gilbert */
288996f363d8SMarkus Armbruster ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
289096f363d8SMarkus Armbruster errp);
2891329c9b10SDr. David Alan Gilbert
2892329c9b10SDr. David Alan Gilbert if (ret < 0) {
2893b86c94a4SMarkus Armbruster rdma->errored = true;
289474ecf6acSFiona Ebner return -1;
2895329c9b10SDr. David Alan Gilbert }
2896329c9b10SDr. David Alan Gilbert
2897329c9b10SDr. David Alan Gilbert /*
2898329c9b10SDr. David Alan Gilbert * SEND was received with new bytes, now try again.
2899329c9b10SDr. David Alan Gilbert */
290025352b37SMarkus Armbruster len = qemu_rdma_fill(rdma, data, want, 0);
290125352b37SMarkus Armbruster done += len;
290225352b37SMarkus Armbruster want -= len;
29036ddd2d76SDaniel P. Berrange
29046ddd2d76SDaniel P. Berrange /* Still didn't get enough, so let's just return */
29056ddd2d76SDaniel P. Berrange if (want) {
29066ddd2d76SDaniel P. Berrange if (done == 0) {
29076ddd2d76SDaniel P. Berrange return QIO_CHANNEL_ERR_BLOCK;
29086ddd2d76SDaniel P. Berrange } else {
29096ddd2d76SDaniel P. Berrange break;
29106ddd2d76SDaniel P. Berrange }
29116ddd2d76SDaniel P. Berrange }
29126ddd2d76SDaniel P. Berrange }
2913f38f6d41SLidong Chen return done;
2914329c9b10SDr. David Alan Gilbert }
2915329c9b10SDr. David Alan Gilbert
2916329c9b10SDr. David Alan Gilbert /*
2917329c9b10SDr. David Alan Gilbert * Block until all the outstanding chunks have been delivered by the hardware.
2918329c9b10SDr. David Alan Gilbert */
2919e3378035SJuan Quintela static int qemu_rdma_drain_cq(RDMAContext *rdma)
2920329c9b10SDr. David Alan Gilbert {
292156095477SMarkus Armbruster Error *err = NULL;
2922329c9b10SDr. David Alan Gilbert
292356095477SMarkus Armbruster if (qemu_rdma_write_flush(rdma, &err) < 0) {
292456095477SMarkus Armbruster error_report_err(err);
29258c6513f7SMarkus Armbruster return -1;
2926329c9b10SDr. David Alan Gilbert }
2927329c9b10SDr. David Alan Gilbert
2928329c9b10SDr. David Alan Gilbert while (rdma->nb_sent) {
29298f5a7faaSJuan Quintela if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) {
2930733252deSDr. David Alan Gilbert error_report("rdma migration: complete polling error!");
29318c6513f7SMarkus Armbruster return -1;
2932329c9b10SDr. David Alan Gilbert }
2933329c9b10SDr. David Alan Gilbert }
2934329c9b10SDr. David Alan Gilbert
2935329c9b10SDr. David Alan Gilbert qemu_rdma_unregister_waiting(rdma);
2936329c9b10SDr. David Alan Gilbert
2937329c9b10SDr. David Alan Gilbert return 0;
2938329c9b10SDr. David Alan Gilbert }
2939329c9b10SDr. David Alan Gilbert
29406ddd2d76SDaniel P. Berrange
29416ddd2d76SDaniel P. Berrange static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
29426ddd2d76SDaniel P. Berrange bool blocking,
29436ddd2d76SDaniel P. Berrange Error **errp)
2944329c9b10SDr. David Alan Gilbert {
29456ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
29466ddd2d76SDaniel P. Berrange /* XXX we should make readv/writev actually honour this :-) */
29476ddd2d76SDaniel P. Berrange rioc->blocking = blocking;
29486ddd2d76SDaniel P. Berrange return 0;
2949329c9b10SDr. David Alan Gilbert }
29506ddd2d76SDaniel P. Berrange
29516ddd2d76SDaniel P. Berrange
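/*
 * GSource glue so the RDMA channel can be polled from a glib main loop.
 * Readiness for G_IO_IN is inferred from control data already buffered
 * in wr_data[0]; the channel is always reported as writable.
 */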
29526ddd2d76SDaniel P. Berrange typedef struct QIOChannelRDMASource QIOChannelRDMASource;
29536ddd2d76SDaniel P. Berrange struct QIOChannelRDMASource {
29546ddd2d76SDaniel P. Berrange GSource parent;
29556ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc;
29566ddd2d76SDaniel P. Berrange GIOCondition condition;
29576ddd2d76SDaniel P. Berrange };
29586ddd2d76SDaniel P. Berrange
29596ddd2d76SDaniel P. Berrange static gboolean
29606ddd2d76SDaniel P. Berrange qio_channel_rdma_source_prepare(GSource *source,
29616ddd2d76SDaniel P. Berrange gint *timeout)
29626ddd2d76SDaniel P. Berrange {
29636ddd2d76SDaniel P. Berrange QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
296474637e6fSLidong Chen RDMAContext *rdma;
29656ddd2d76SDaniel P. Berrange GIOCondition cond = 0;
29666ddd2d76SDaniel P. Berrange *timeout = -1;
29676ddd2d76SDaniel P. Berrange
2968987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
296974637e6fSLidong Chen if (rsource->condition == G_IO_IN) {
2970d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
297174637e6fSLidong Chen } else {
2972d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
297374637e6fSLidong Chen }
297474637e6fSLidong Chen
297574637e6fSLidong Chen if (!rdma) {
297674637e6fSLidong Chen error_report("RDMAContext is NULL when preparing GSource");
297774637e6fSLidong Chen return FALSE;
297874637e6fSLidong Chen }
297974637e6fSLidong Chen
29806ddd2d76SDaniel P. Berrange if (rdma->wr_data[0].control_len) {
29816ddd2d76SDaniel P. Berrange cond |= G_IO_IN;
29826ddd2d76SDaniel P. Berrange }
29836ddd2d76SDaniel P. Berrange cond |= G_IO_OUT;
29846ddd2d76SDaniel P. Berrange
29856ddd2d76SDaniel P. Berrange return cond & rsource->condition;
29866ddd2d76SDaniel P. Berrange }
29876ddd2d76SDaniel P. Berrange
29886ddd2d76SDaniel P. Berrange static gboolean
29896ddd2d76SDaniel P. Berrange qio_channel_rdma_source_check(GSource *source)
29906ddd2d76SDaniel P. Berrange {
29916ddd2d76SDaniel P. Berrange QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
299274637e6fSLidong Chen RDMAContext *rdma;
29936ddd2d76SDaniel P. Berrange GIOCondition cond = 0;
29946ddd2d76SDaniel P. Berrange
2995987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
299674637e6fSLidong Chen if (rsource->condition == G_IO_IN) {
2997d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
299874637e6fSLidong Chen } else {
2999d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
300074637e6fSLidong Chen }
300174637e6fSLidong Chen
300274637e6fSLidong Chen if (!rdma) {
300374637e6fSLidong Chen error_report("RDMAContext is NULL when checking GSource");
300474637e6fSLidong Chen return FALSE;
300574637e6fSLidong Chen }
300674637e6fSLidong Chen
30076ddd2d76SDaniel P. Berrange if (rdma->wr_data[0].control_len) {
30086ddd2d76SDaniel P. Berrange cond |= G_IO_IN;
30096ddd2d76SDaniel P. Berrange }
30106ddd2d76SDaniel P. Berrange cond |= G_IO_OUT;
30116ddd2d76SDaniel P. Berrange
30126ddd2d76SDaniel P. Berrange return cond & rsource->condition;
30136ddd2d76SDaniel P. Berrange }
30146ddd2d76SDaniel P. Berrange
30156ddd2d76SDaniel P. Berrange static gboolean
30166ddd2d76SDaniel P. Berrange qio_channel_rdma_source_dispatch(GSource *source,
30176ddd2d76SDaniel P. Berrange GSourceFunc callback,
30186ddd2d76SDaniel P. Berrange gpointer user_data)
30196ddd2d76SDaniel P. Berrange {
30206ddd2d76SDaniel P. Berrange QIOChannelFunc func = (QIOChannelFunc)callback;
30216ddd2d76SDaniel P. Berrange QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
302274637e6fSLidong Chen RDMAContext *rdma;
30236ddd2d76SDaniel P. Berrange GIOCondition cond = 0;
30246ddd2d76SDaniel P. Berrange
3025987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
302674637e6fSLidong Chen if (rsource->condition == G_IO_IN) {
3027d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
302874637e6fSLidong Chen } else {
3029d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
303074637e6fSLidong Chen }
303174637e6fSLidong Chen
303274637e6fSLidong Chen if (!rdma) {
303374637e6fSLidong Chen error_report("RDMAContext is NULL when dispatching GSource");
303474637e6fSLidong Chen return FALSE;
303574637e6fSLidong Chen }
303674637e6fSLidong Chen
30376ddd2d76SDaniel P. Berrange if (rdma->wr_data[0].control_len) {
30386ddd2d76SDaniel P. Berrange cond |= G_IO_IN;
30396ddd2d76SDaniel P. Berrange }
30406ddd2d76SDaniel P. Berrange cond |= G_IO_OUT;
30416ddd2d76SDaniel P. Berrange
30426ddd2d76SDaniel P. Berrange return (*func)(QIO_CHANNEL(rsource->rioc),
30436ddd2d76SDaniel P. Berrange (cond & rsource->condition),
30446ddd2d76SDaniel P. Berrange user_data);
30456ddd2d76SDaniel P. Berrange }
30466ddd2d76SDaniel P. Berrange
30476ddd2d76SDaniel P. Berrange static void
30486ddd2d76SDaniel P. Berrange qio_channel_rdma_source_finalize(GSource *source)
30496ddd2d76SDaniel P. Berrange {
30506ddd2d76SDaniel P. Berrange QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
30516ddd2d76SDaniel P. Berrange
30526ddd2d76SDaniel P. Berrange object_unref(OBJECT(ssource->rioc));
30536ddd2d76SDaniel P. Berrange }
30546ddd2d76SDaniel P. Berrange
305536cc822dSMarkus Armbruster static GSourceFuncs qio_channel_rdma_source_funcs = {
30566ddd2d76SDaniel P. Berrange qio_channel_rdma_source_prepare,
30576ddd2d76SDaniel P. Berrange qio_channel_rdma_source_check,
30586ddd2d76SDaniel P. Berrange qio_channel_rdma_source_dispatch,
30596ddd2d76SDaniel P. Berrange qio_channel_rdma_source_finalize
30606ddd2d76SDaniel P. Berrange };
30616ddd2d76SDaniel P. Berrange
30626ddd2d76SDaniel P. Berrange static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
30636ddd2d76SDaniel P. Berrange GIOCondition condition)
30646ddd2d76SDaniel P. Berrange {
30656ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
30666ddd2d76SDaniel P. Berrange QIOChannelRDMASource *ssource;
30676ddd2d76SDaniel P. Berrange GSource *source;
30686ddd2d76SDaniel P. Berrange
30696ddd2d76SDaniel P. Berrange source = g_source_new(&qio_channel_rdma_source_funcs,
30706ddd2d76SDaniel P. Berrange sizeof(QIOChannelRDMASource));
30716ddd2d76SDaniel P. Berrange ssource = (QIOChannelRDMASource *)source;
30726ddd2d76SDaniel P. Berrange
30736ddd2d76SDaniel P. Berrange ssource->rioc = rioc;
30746ddd2d76SDaniel P. Berrange object_ref(OBJECT(rioc));
30756ddd2d76SDaniel P. Berrange
30766ddd2d76SDaniel P. Berrange ssource->condition = condition;
30776ddd2d76SDaniel P. Berrange
30786ddd2d76SDaniel P. Berrange return source;
30796ddd2d76SDaniel P. Berrange }
30806ddd2d76SDaniel P. Berrange
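/*
 * Hook the completion-channel fds into an AioContext. When a read
 * handler is supplied, both of the inbound context's completion
 * channels (send and receive) are watched in @read_ctx; otherwise the
 * outbound context's channels are watched in @write_ctx.
 */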
30814d9f675bSLidong Chen static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
308206e0f098SStefan Hajnoczi AioContext *read_ctx,
30834d9f675bSLidong Chen IOHandler *io_read,
308406e0f098SStefan Hajnoczi AioContext *write_ctx,
30854d9f675bSLidong Chen IOHandler *io_write,
30864d9f675bSLidong Chen void *opaque)
30874d9f675bSLidong Chen {
30884d9f675bSLidong Chen QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
30894d9f675bSLidong Chen if (io_read) {
309006e0f098SStefan Hajnoczi aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
309106e0f098SStefan Hajnoczi io_read, io_write, NULL, NULL, opaque);
309206e0f098SStefan Hajnoczi aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
309306e0f098SStefan Hajnoczi io_read, io_write, NULL, NULL, opaque);
30944d9f675bSLidong Chen } else {
309506e0f098SStefan Hajnoczi aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
309606e0f098SStefan Hajnoczi io_read, io_write, NULL, NULL, opaque);
309706e0f098SStefan Hajnoczi aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
309806e0f098SStefan Hajnoczi io_read, io_write, NULL, NULL, opaque);
30994d9f675bSLidong Chen }
31004d9f675bSLidong Chen }
31016ddd2d76SDaniel P. Berrange
3102d46a4847SDr. David Alan Gilbert struct rdma_close_rcu {
3103d46a4847SDr. David Alan Gilbert struct rcu_head rcu;
3104d46a4847SDr. David Alan Gilbert RDMAContext *rdmain;
3105d46a4847SDr. David Alan Gilbert RDMAContext *rdmaout;
3106d46a4847SDr. David Alan Gilbert };
3107d46a4847SDr. David Alan Gilbert
3108d46a4847SDr. David Alan Gilbert /* callback from qio_channel_rdma_close via call_rcu */
3109d46a4847SDr. David Alan Gilbert static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3110d46a4847SDr. David Alan Gilbert {
3111d46a4847SDr. David Alan Gilbert if (rcu->rdmain) {
3112d46a4847SDr. David Alan Gilbert qemu_rdma_cleanup(rcu->rdmain);
3113d46a4847SDr. David Alan Gilbert }
3114d46a4847SDr. David Alan Gilbert
3115d46a4847SDr. David Alan Gilbert if (rcu->rdmaout) {
3116d46a4847SDr. David Alan Gilbert qemu_rdma_cleanup(rcu->rdmaout);
3117d46a4847SDr. David Alan Gilbert }
3118d46a4847SDr. David Alan Gilbert
3119d46a4847SDr. David Alan Gilbert g_free(rcu->rdmain);
3120d46a4847SDr. David Alan Gilbert g_free(rcu->rdmaout);
3121d46a4847SDr. David Alan Gilbert g_free(rcu);
3122d46a4847SDr. David Alan Gilbert }
3123d46a4847SDr. David Alan Gilbert
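/*
 * Detach both RDMA contexts from the channel and defer the actual
 * teardown to an RCU callback, so concurrent readers still inside an
 * RCU read-side critical section never see freed memory.
 */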
31246ddd2d76SDaniel P. Berrange static int qio_channel_rdma_close(QIOChannel *ioc,
31256ddd2d76SDaniel P. Berrange Error **errp)
31266ddd2d76SDaniel P. Berrange {
31276ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
312874637e6fSLidong Chen RDMAContext *rdmain, *rdmaout;
3129d46a4847SDr. David Alan Gilbert struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3130d46a4847SDr. David Alan Gilbert
31316ddd2d76SDaniel P. Berrange trace_qemu_rdma_close();
313274637e6fSLidong Chen
313374637e6fSLidong Chen rdmain = rioc->rdmain;
313474637e6fSLidong Chen if (rdmain) {
3135d73415a3SStefan Hajnoczi qatomic_rcu_set(&rioc->rdmain, NULL);
313612c67ffbSDr. David Alan Gilbert }
313774637e6fSLidong Chen
313874637e6fSLidong Chen rdmaout = rioc->rdmaout;
313974637e6fSLidong Chen if (rdmaout) {
3140d73415a3SStefan Hajnoczi qatomic_rcu_set(&rioc->rdmaout, NULL);
31416ddd2d76SDaniel P. Berrange }
314274637e6fSLidong Chen
3143d46a4847SDr. David Alan Gilbert rcu->rdmain = rdmain;
3144d46a4847SDr. David Alan Gilbert rcu->rdmaout = rdmaout;
3145d46a4847SDr. David Alan Gilbert call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
314674637e6fSLidong Chen
3147329c9b10SDr. David Alan Gilbert return 0;
3148329c9b10SDr. David Alan Gilbert }
3149329c9b10SDr. David Alan Gilbert
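/*
 * Shutdown only marks the selected context(s) as errored; subsequent
 * reads or writes then fail early instead of touching the hardware.
 */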
315054db882fSLidong Chen static int
315154db882fSLidong Chen qio_channel_rdma_shutdown(QIOChannel *ioc,
315254db882fSLidong Chen QIOChannelShutdown how,
315354db882fSLidong Chen Error **errp)
315454db882fSLidong Chen {
315554db882fSLidong Chen QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
315654db882fSLidong Chen RDMAContext *rdmain, *rdmaout;
315754db882fSLidong Chen
3158987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
315954db882fSLidong Chen
3160d73415a3SStefan Hajnoczi rdmain = qatomic_rcu_read(&rioc->rdmain);
3161d73415a3SStefan Hajnoczi rdmaout = qatomic_rcu_read(&rioc->rdmaout);
316254db882fSLidong Chen
316354db882fSLidong Chen switch (how) {
316454db882fSLidong Chen case QIO_CHANNEL_SHUTDOWN_READ:
316554db882fSLidong Chen if (rdmain) {
3166b86c94a4SMarkus Armbruster rdmain->errored = true;
316754db882fSLidong Chen }
316854db882fSLidong Chen break;
316954db882fSLidong Chen case QIO_CHANNEL_SHUTDOWN_WRITE:
317054db882fSLidong Chen if (rdmaout) {
3171b86c94a4SMarkus Armbruster rdmaout->errored = true;
317254db882fSLidong Chen }
317354db882fSLidong Chen break;
317454db882fSLidong Chen case QIO_CHANNEL_SHUTDOWN_BOTH:
317554db882fSLidong Chen default:
317654db882fSLidong Chen if (rdmain) {
3177b86c94a4SMarkus Armbruster rdmain->errored = true;
317854db882fSLidong Chen }
317954db882fSLidong Chen if (rdmaout) {
3180b86c94a4SMarkus Armbruster rdmaout->errored = true;
318154db882fSLidong Chen }
318254db882fSLidong Chen break;
318354db882fSLidong Chen }
318454db882fSLidong Chen
318554db882fSLidong Chen return 0;
318654db882fSLidong Chen }
318754db882fSLidong Chen
3188329c9b10SDr. David Alan Gilbert /*
3189329c9b10SDr. David Alan Gilbert * Parameters:
3190329c9b10SDr. David Alan Gilbert * @offset == 0 :
3191329c9b10SDr. David Alan Gilbert * This means that 'block_offset' is a full virtual address that does not
3192329c9b10SDr. David Alan Gilbert * belong to a RAMBlock of the virtual machine and instead
3193329c9b10SDr. David Alan Gilbert * represents a private malloc'd memory area that the caller wishes to
3194329c9b10SDr. David Alan Gilbert * transfer.
3195329c9b10SDr. David Alan Gilbert *
3196329c9b10SDr. David Alan Gilbert * @offset != 0 :
3197329c9b10SDr. David Alan Gilbert * Offset is an offset to be added to block_offset and used
3198329c9b10SDr. David Alan Gilbert * to also lookup the corresponding RAMBlock.
3199329c9b10SDr. David Alan Gilbert *
3200246683c2SDaniel P. Berrangé * @size : Number of bytes to transfer
3201329c9b10SDr. David Alan Gilbert *
32029c53d369SJuan Quintela * @pages_sent : User-specified pointer to indicate how many pages were
3203329c9b10SDr. David Alan Gilbert * sent. Usually, this will not be more than a few bytes of
3204329c9b10SDr. David Alan Gilbert * the protocol because most transfers are sent asynchronously.
3205329c9b10SDr. David Alan Gilbert */
32069c53d369SJuan Quintela static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
32079c53d369SJuan Quintela ram_addr_t offset, size_t size)
3208329c9b10SDr. David Alan Gilbert {
3209365c0463SDaniel P. Berrangé QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3210446e559cSMarkus Armbruster Error *err = NULL;
321174637e6fSLidong Chen RDMAContext *rdma;
3212329c9b10SDr. David Alan Gilbert int ret;
3213329c9b10SDr. David Alan Gilbert
3214987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
3215d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rioc->rdmaout);
321674637e6fSLidong Chen
321774637e6fSLidong Chen if (!rdma) {
32180110c6b8SMarkus Armbruster return -1;
321974637e6fSLidong Chen }
322074637e6fSLidong Chen
3221b86c94a4SMarkus Armbruster if (rdma_errored(rdma)) {
32220110c6b8SMarkus Armbruster return -1;
3223de3e05e8SMarkus Armbruster }
3224329c9b10SDr. David Alan Gilbert
3225329c9b10SDr. David Alan Gilbert qemu_fflush(f);
3226329c9b10SDr. David Alan Gilbert
3227329c9b10SDr. David Alan Gilbert /*
3228329c9b10SDr. David Alan Gilbert * Add this page to the current 'chunk'. If the chunk
32293a4452d8Szhaolichang * is full, or the page doesn't belong to the current chunk,
3230329c9b10SDr. David Alan Gilbert * an actual RDMA write will occur and a new chunk will be formed.
3231329c9b10SDr. David Alan Gilbert */
3232446e559cSMarkus Armbruster ret = qemu_rdma_write(rdma, block_offset, offset, size, &err);
3233329c9b10SDr. David Alan Gilbert if (ret < 0) {
3234446e559cSMarkus Armbruster error_report_err(err);
3235329c9b10SDr. David Alan Gilbert goto err;
3236329c9b10SDr. David Alan Gilbert }
3237329c9b10SDr. David Alan Gilbert
3238329c9b10SDr. David Alan Gilbert /*
3239329c9b10SDr. David Alan Gilbert * Drain the Completion Queue if possible, but do not block,
3240329c9b10SDr. David Alan Gilbert * just poll.
3241329c9b10SDr. David Alan Gilbert *
3242329c9b10SDr. David Alan Gilbert * If nothing to poll, the end of the iteration will do this
3243329c9b10SDr. David Alan Gilbert * again to make sure we don't overflow the request queue.
3244329c9b10SDr. David Alan Gilbert */
3245329c9b10SDr. David Alan Gilbert while (1) {
3246329c9b10SDr. David Alan Gilbert uint64_t wr_id, wr_id_in;
3247bbde6562SMarkus Armbruster ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3248bbde6562SMarkus Armbruster
3249b390afd8SLi Zhijian if (ret < 0) {
32501b6e1da6SMarkus Armbruster error_report("rdma migration: polling error");
3251b390afd8SLi Zhijian goto err;
3252b390afd8SLi Zhijian }
3253b390afd8SLi Zhijian
3254b390afd8SLi Zhijian wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3255b390afd8SLi Zhijian
3256b390afd8SLi Zhijian if (wr_id == RDMA_WRID_NONE) {
3257b390afd8SLi Zhijian break;
3258b390afd8SLi Zhijian }
3259b390afd8SLi Zhijian }
3260b390afd8SLi Zhijian
3261b390afd8SLi Zhijian while (1) {
3262b390afd8SLi Zhijian uint64_t wr_id, wr_id_in;
3263bbde6562SMarkus Armbruster ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3264bbde6562SMarkus Armbruster
3265329c9b10SDr. David Alan Gilbert if (ret < 0) {
32661b6e1da6SMarkus Armbruster error_report("rdma migration: polling error");
3267329c9b10SDr. David Alan Gilbert goto err;
3268329c9b10SDr. David Alan Gilbert }
3269329c9b10SDr. David Alan Gilbert
3270329c9b10SDr. David Alan Gilbert wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3271329c9b10SDr. David Alan Gilbert
3272329c9b10SDr. David Alan Gilbert if (wr_id == RDMA_WRID_NONE) {
3273329c9b10SDr. David Alan Gilbert break;
3274329c9b10SDr. David Alan Gilbert }
3275329c9b10SDr. David Alan Gilbert }
3276329c9b10SDr. David Alan Gilbert
3277329c9b10SDr. David Alan Gilbert return RAM_SAVE_CONTROL_DELAYED;
32780110c6b8SMarkus Armbruster
3279329c9b10SDr. David Alan Gilbert err:
3280b86c94a4SMarkus Armbruster rdma->errored = true;
32810110c6b8SMarkus Armbruster return -1;
3282329c9b10SDr. David Alan Gilbert }
3283329c9b10SDr. David Alan Gilbert
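/*
 * Entry point used by the RAM save path: returns
 * RAM_SAVE_CONTROL_NOT_SUPP unless an RDMA migration is active (and not
 * in postcopy); otherwise the page is queued via qemu_rdma_save_page()
 * and any error is propagated to the QEMUFile.
 */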
3284e493008dSJuan Quintela int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
3285e493008dSJuan Quintela ram_addr_t offset, size_t size)
3286e493008dSJuan Quintela {
3287a4832d29SJuan Quintela if (!migrate_rdma() || migration_in_postcopy()) {
3288e493008dSJuan Quintela return RAM_SAVE_CONTROL_NOT_SUPP;
3289e493008dSJuan Quintela }
3290e493008dSJuan Quintela
3291e493008dSJuan Quintela int ret = qemu_rdma_save_page(f, block_offset, offset, size);
3292e493008dSJuan Quintela
3293e493008dSJuan Quintela if (ret != RAM_SAVE_CONTROL_DELAYED &&
3294e493008dSJuan Quintela ret != RAM_SAVE_CONTROL_NOT_SUPP) {
3295e493008dSJuan Quintela if (ret < 0) {
3296e493008dSJuan Quintela qemu_file_set_error(f, ret);
3297e493008dSJuan Quintela }
3298e493008dSJuan Quintela }
3299e493008dSJuan Quintela return ret;
3300e493008dSJuan Quintela }
3301e493008dSJuan Quintela
330255cc1b59SLidong Chen static void rdma_accept_incoming_migration(void *opaque);
330355cc1b59SLidong Chen
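/*
 * fd handler for the destination's connection-manager channel. A
 * DISCONNECTED or DEVICE_REMOVAL event before the migration has
 * completed marks the context (and its return path) as errored and
 * wakes the load coroutine if one is waiting.
 */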
330492370989SLidong Chen static void rdma_cm_poll_handler(void *opaque)
330592370989SLidong Chen {
330692370989SLidong Chen RDMAContext *rdma = opaque;
330792370989SLidong Chen struct rdma_cm_event *cm_event;
330892370989SLidong Chen MigrationIncomingState *mis = migration_incoming_get_current();
330992370989SLidong Chen
33108f5a7faaSJuan Quintela if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
331192370989SLidong Chen error_report("get_cm_event failed %d", errno);
331292370989SLidong Chen return;
331392370989SLidong Chen }
331492370989SLidong Chen
331592370989SLidong Chen if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
331692370989SLidong Chen cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3317b86c94a4SMarkus Armbruster if (!rdma->errored &&
3318de8434a3SDr. David Alan Gilbert migration_incoming_get_current()->state !=
3319de8434a3SDr. David Alan Gilbert MIGRATION_STATUS_COMPLETED) {
332092370989SLidong Chen error_report("received cm event %d", cm_event->event);
3321b86c94a4SMarkus Armbruster rdma->errored = true;
332292370989SLidong Chen if (rdma->return_path) {
3323b86c94a4SMarkus Armbruster rdma->return_path->errored = true;
332492370989SLidong Chen }
3325de8434a3SDr. David Alan Gilbert }
33266b8c2eb5SLi Zhijian rdma_ack_cm_event(cm_event);
3327dd42ce24SVladimir Sementsov-Ogievskiy if (mis->loadvm_co) {
3328dd42ce24SVladimir Sementsov-Ogievskiy qemu_coroutine_enter(mis->loadvm_co);
332992370989SLidong Chen }
333092370989SLidong Chen return;
333192370989SLidong Chen }
33326b8c2eb5SLi Zhijian rdma_ack_cm_event(cm_event);
333392370989SLidong Chen }
333492370989SLidong Chen
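/*
 * Destination side of connection setup: handle an incoming
 * CONNECT_REQUEST, negotiate capabilities, create the protection
 * domain, completion queues and queue pair, register the control
 * buffers, then accept and wait for the connection to be ESTABLISHED.
 * When postcopy or a return path is enabled, a second connection
 * request is expected for the return path.
 */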
3335329c9b10SDr. David Alan Gilbert static int qemu_rdma_accept(RDMAContext *rdma)
3336329c9b10SDr. David Alan Gilbert {
33373c0c3ebaSMarkus Armbruster Error *err = NULL;
3338329c9b10SDr. David Alan Gilbert RDMACapabilities cap;
3339329c9b10SDr. David Alan Gilbert struct rdma_conn_param conn_param = {
3340329c9b10SDr. David Alan Gilbert .responder_resources = 2,
3341329c9b10SDr. David Alan Gilbert .private_data = &cap,
3342329c9b10SDr. David Alan Gilbert .private_data_len = sizeof(cap),
3343329c9b10SDr. David Alan Gilbert };
334444bcfd45SLi Zhijian RDMAContext *rdma_return_path = NULL;
33453fa9642fSHet Gala g_autoptr(InetSocketAddress) isock = g_new0(InetSocketAddress, 1);
3346329c9b10SDr. David Alan Gilbert struct rdma_cm_event *cm_event;
3347329c9b10SDr. David Alan Gilbert struct ibv_context *verbs;
3348c0d77702SMarkus Armbruster int ret;
3349329c9b10SDr. David Alan Gilbert
3350329c9b10SDr. David Alan Gilbert ret = rdma_get_cm_event(rdma->channel, &cm_event);
33514a102179SMarkus Armbruster if (ret < 0) {
3352329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3353329c9b10SDr. David Alan Gilbert }
3354329c9b10SDr. David Alan Gilbert
3355329c9b10SDr. David Alan Gilbert if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3356329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
3357329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3358329c9b10SDr. David Alan Gilbert }
3359329c9b10SDr. David Alan Gilbert
336069f7b00dSYu Zhang isock->host = g_strdup(rdma->host);
33613fa9642fSHet Gala isock->port = g_strdup_printf("%d", rdma->port);
33623fa9642fSHet Gala
336344bcfd45SLi Zhijian /*
336444bcfd45SLi Zhijian * initialize the RDMAContext for return path for postcopy after first
336544bcfd45SLi Zhijian * connection request reached.
336644bcfd45SLi Zhijian */
336738ad1110SJuan Quintela if ((migrate_postcopy() || migrate_return_path())
3368a5382214SDr. David Alan Gilbert && !rdma->is_return_path) {
33693fa9642fSHet Gala rdma_return_path = qemu_rdma_data_init(isock, NULL);
337044bcfd45SLi Zhijian if (rdma_return_path == NULL) {
337144bcfd45SLi Zhijian rdma_ack_cm_event(cm_event);
337244bcfd45SLi Zhijian goto err_rdma_dest_wait;
337344bcfd45SLi Zhijian }
337444bcfd45SLi Zhijian
337544bcfd45SLi Zhijian qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
337644bcfd45SLi Zhijian }
337744bcfd45SLi Zhijian
3378329c9b10SDr. David Alan Gilbert memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3379329c9b10SDr. David Alan Gilbert
3380329c9b10SDr. David Alan Gilbert network_to_caps(&cap);
3381329c9b10SDr. David Alan Gilbert
3382329c9b10SDr. David Alan Gilbert if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3383733252deSDr. David Alan Gilbert error_report("Unknown source RDMA version: %d, bailing...",
3384329c9b10SDr. David Alan Gilbert cap.version);
3385329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
3386329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3387329c9b10SDr. David Alan Gilbert }
3388329c9b10SDr. David Alan Gilbert
3389329c9b10SDr. David Alan Gilbert /*
3390329c9b10SDr. David Alan Gilbert * Respond with only the capabilities this version of QEMU knows about.
3391329c9b10SDr. David Alan Gilbert */
3392329c9b10SDr. David Alan Gilbert cap.flags &= known_capabilities;
3393329c9b10SDr. David Alan Gilbert
3394329c9b10SDr. David Alan Gilbert /*
3395329c9b10SDr. David Alan Gilbert * Enable the ones that we do know about.
3396329c9b10SDr. David Alan Gilbert * Add other checks here as new ones are introduced.
3397329c9b10SDr. David Alan Gilbert */
3398329c9b10SDr. David Alan Gilbert if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3399329c9b10SDr. David Alan Gilbert rdma->pin_all = true;
3400329c9b10SDr. David Alan Gilbert }
3401329c9b10SDr. David Alan Gilbert
3402329c9b10SDr. David Alan Gilbert rdma->cm_id = cm_event->id;
3403329c9b10SDr. David Alan Gilbert verbs = cm_event->id->verbs;
3404329c9b10SDr. David Alan Gilbert
3405329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
3406329c9b10SDr. David Alan Gilbert
3407733252deSDr. David Alan Gilbert trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3408329c9b10SDr. David Alan Gilbert
3409329c9b10SDr. David Alan Gilbert caps_to_network(&cap);
3410329c9b10SDr. David Alan Gilbert
3411733252deSDr. David Alan Gilbert trace_qemu_rdma_accept_pin_verbsc(verbs);
3412329c9b10SDr. David Alan Gilbert
3413329c9b10SDr. David Alan Gilbert if (!rdma->verbs) {
3414329c9b10SDr. David Alan Gilbert rdma->verbs = verbs;
3415329c9b10SDr. David Alan Gilbert } else if (rdma->verbs != verbs) {
3416733252deSDr. David Alan Gilbert error_report("ibv context not matching %p, %p!", rdma->verbs,
3417733252deSDr. David Alan Gilbert verbs);
3418329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3419329c9b10SDr. David Alan Gilbert }
3420329c9b10SDr. David Alan Gilbert
3421329c9b10SDr. David Alan Gilbert qemu_rdma_dump_id("dest_init", verbs);
3422329c9b10SDr. David Alan Gilbert
342307d5b946SMarkus Armbruster ret = qemu_rdma_alloc_pd_cq(rdma, &err);
34244a102179SMarkus Armbruster if (ret < 0) {
342507d5b946SMarkus Armbruster error_report_err(err);
3426329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3427329c9b10SDr. David Alan Gilbert }
3428329c9b10SDr. David Alan Gilbert
3429329c9b10SDr. David Alan Gilbert ret = qemu_rdma_alloc_qp(rdma);
34304a102179SMarkus Armbruster if (ret < 0) {
3431733252deSDr. David Alan Gilbert error_report("rdma migration: error allocating qp!");
3432329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3433329c9b10SDr. David Alan Gilbert }
3434329c9b10SDr. David Alan Gilbert
34350610d7a1SMarkus Armbruster qemu_rdma_init_ram_blocks(rdma);
3436329c9b10SDr. David Alan Gilbert
3437ebdb85f9SJuan Quintela for (int i = 0; i < RDMA_WRID_MAX; i++) {
3438ebdb85f9SJuan Quintela ret = qemu_rdma_reg_control(rdma, i);
34394a102179SMarkus Armbruster if (ret < 0) {
3440ebdb85f9SJuan Quintela error_report("rdma: error registering %d control", i);
3441329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3442329c9b10SDr. David Alan Gilbert }
3443329c9b10SDr. David Alan Gilbert }
3444329c9b10SDr. David Alan Gilbert
344555cc1b59SLidong Chen /* Accept the second connection request for return path */
344638ad1110SJuan Quintela if ((migrate_postcopy() || migrate_return_path())
3447a5382214SDr. David Alan Gilbert && !rdma->is_return_path) {
344855cc1b59SLidong Chen qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
344955cc1b59SLidong Chen NULL,
345055cc1b59SLidong Chen (void *)(intptr_t)rdma->return_path);
345155cc1b59SLidong Chen } else {
345292370989SLidong Chen qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
345392370989SLidong Chen NULL, rdma);
345455cc1b59SLidong Chen }
3455329c9b10SDr. David Alan Gilbert
3456329c9b10SDr. David Alan Gilbert ret = rdma_accept(rdma->cm_id, &conn_param);
34574a102179SMarkus Armbruster if (ret < 0) {
34581b6e1da6SMarkus Armbruster error_report("rdma_accept failed");
3459329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3460329c9b10SDr. David Alan Gilbert }
3461329c9b10SDr. David Alan Gilbert
3462329c9b10SDr. David Alan Gilbert ret = rdma_get_cm_event(rdma->channel, &cm_event);
34634a102179SMarkus Armbruster if (ret < 0) {
34641b6e1da6SMarkus Armbruster error_report("rdma_accept get_cm_event failed");
3465329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3466329c9b10SDr. David Alan Gilbert }
3467329c9b10SDr. David Alan Gilbert
3468329c9b10SDr. David Alan Gilbert if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3469733252deSDr. David Alan Gilbert error_report("rdma_accept: event is not ESTABLISHED");
3470329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
3471329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3472329c9b10SDr. David Alan Gilbert }
3473329c9b10SDr. David Alan Gilbert
3474329c9b10SDr. David Alan Gilbert rdma_ack_cm_event(cm_event);
3475329c9b10SDr. David Alan Gilbert rdma->connected = true;
3476329c9b10SDr. David Alan Gilbert
34773c0c3ebaSMarkus Armbruster ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err);
34784a102179SMarkus Armbruster if (ret < 0) {
34793c0c3ebaSMarkus Armbruster error_report_err(err);
3480329c9b10SDr. David Alan Gilbert goto err_rdma_dest_wait;
3481329c9b10SDr. David Alan Gilbert }
3482329c9b10SDr. David Alan Gilbert
3483329c9b10SDr. David Alan Gilbert qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3484329c9b10SDr. David Alan Gilbert
3485329c9b10SDr. David Alan Gilbert return 0;
3486329c9b10SDr. David Alan Gilbert
3487329c9b10SDr. David Alan Gilbert err_rdma_dest_wait:
3488b86c94a4SMarkus Armbruster rdma->errored = true;
3489329c9b10SDr. David Alan Gilbert qemu_rdma_cleanup(rdma);
349044bcfd45SLi Zhijian g_free(rdma_return_path);
3491ec486974SMarkus Armbruster return -1;
3492329c9b10SDr. David Alan Gilbert }
3493329c9b10SDr. David Alan Gilbert
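/*
 * qsort() comparator ordering RDMALocalBlocks by their source-side
 * index; written as a comparison rather than a subtraction to avoid
 * any overflow concerns.
 */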
3494e4d63320SDr. David Alan Gilbert static int dest_ram_sort_func(const void *a, const void *b)
3495e4d63320SDr. David Alan Gilbert {
3496e4d63320SDr. David Alan Gilbert unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3497e4d63320SDr. David Alan Gilbert unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3498e4d63320SDr. David Alan Gilbert
3499e4d63320SDr. David Alan Gilbert return (a_index < b_index) ? -1 : (a_index != b_index);
3500e4d63320SDr. David Alan Gilbert }
3501e4d63320SDr. David Alan Gilbert
3502329c9b10SDr. David Alan Gilbert /*
3503329c9b10SDr. David Alan Gilbert * During each iteration of the migration, we listen for instructions
3504329c9b10SDr. David Alan Gilbert * by the source VM to perform dynamic page registrations before they
3505329c9b10SDr. David Alan Gilbert * can perform RDMA operations.
3506329c9b10SDr. David Alan Gilbert *
3507329c9b10SDr. David Alan Gilbert * We respond with the 'rkey'.
3508329c9b10SDr. David Alan Gilbert *
3509329c9b10SDr. David Alan Gilbert * Keep doing this until the source tells us to stop.
3510329c9b10SDr. David Alan Gilbert */
3511b1b38387SJuan Quintela int rdma_registration_handle(QEMUFile *f)
3512329c9b10SDr. David Alan Gilbert {
3513329c9b10SDr. David Alan Gilbert RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3514329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_REGISTER_RESULT,
3515329c9b10SDr. David Alan Gilbert .repeat = 0,
3516329c9b10SDr. David Alan Gilbert };
3517329c9b10SDr. David Alan Gilbert RDMAControlHeader unreg_resp = { .len = 0,
3518329c9b10SDr. David Alan Gilbert .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3519329c9b10SDr. David Alan Gilbert .repeat = 0,
3520329c9b10SDr. David Alan Gilbert };
3521329c9b10SDr. David Alan Gilbert RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3522329c9b10SDr. David Alan Gilbert .repeat = 1 };
3523f6d6c089SJuan Quintela QIOChannelRDMA *rioc;
352496f363d8SMarkus Armbruster Error *err = NULL;
352574637e6fSLidong Chen RDMAContext *rdma;
352674637e6fSLidong Chen RDMALocalBlocks *local;
3527329c9b10SDr. David Alan Gilbert RDMAControlHeader head;
3528329c9b10SDr. David Alan Gilbert RDMARegister *reg, *registers;
3529329c9b10SDr. David Alan Gilbert RDMACompress *comp;
3530329c9b10SDr. David Alan Gilbert RDMARegisterResult *reg_result;
3531329c9b10SDr. David Alan Gilbert static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3532329c9b10SDr. David Alan Gilbert RDMALocalBlock *block;
3533329c9b10SDr. David Alan Gilbert void *host_addr;
3534c0d77702SMarkus Armbruster int ret;
3535329c9b10SDr. David Alan Gilbert int idx = 0;
3536329c9b10SDr. David Alan Gilbert
3537f6d6c089SJuan Quintela if (!migrate_rdma()) {
3538f6d6c089SJuan Quintela return 0;
3539f6d6c089SJuan Quintela }
3540f6d6c089SJuan Quintela
3541987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
3542f6d6c089SJuan Quintela rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3543d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rioc->rdmain);
354474637e6fSLidong Chen
354574637e6fSLidong Chen if (!rdma) {
35460110c6b8SMarkus Armbruster return -1;
354774637e6fSLidong Chen }
354874637e6fSLidong Chen
3549b86c94a4SMarkus Armbruster if (rdma_errored(rdma)) {
35500110c6b8SMarkus Armbruster return -1;
3551de3e05e8SMarkus Armbruster }
3552329c9b10SDr. David Alan Gilbert
355374637e6fSLidong Chen local = &rdma->local_ram_blocks;
3554329c9b10SDr. David Alan Gilbert do {
3555b1b38387SJuan Quintela trace_rdma_registration_handle_wait();
3556329c9b10SDr. David Alan Gilbert
355796f363d8SMarkus Armbruster ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err);
3558329c9b10SDr. David Alan Gilbert
3559329c9b10SDr. David Alan Gilbert if (ret < 0) {
356096f363d8SMarkus Armbruster error_report_err(err);
3561329c9b10SDr. David Alan Gilbert break;
3562329c9b10SDr. David Alan Gilbert }
3563329c9b10SDr. David Alan Gilbert
3564329c9b10SDr. David Alan Gilbert if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3565733252deSDr. David Alan Gilbert error_report("rdma: Too many requests in this message (%d). "
3566733252deSDr. David Alan Gilbert "Bailing.", head.repeat);
3567329c9b10SDr. David Alan Gilbert break;
3568329c9b10SDr. David Alan Gilbert }
3569329c9b10SDr. David Alan Gilbert
3570329c9b10SDr. David Alan Gilbert switch (head.type) {
3571329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_COMPRESS:
3572329c9b10SDr. David Alan Gilbert comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3573329c9b10SDr. David Alan Gilbert network_to_compress(comp);
3574329c9b10SDr. David Alan Gilbert
3575b1b38387SJuan Quintela trace_rdma_registration_handle_compress(comp->length,
3576733252deSDr. David Alan Gilbert comp->block_idx,
3577733252deSDr. David Alan Gilbert comp->offset);
3578afcddefdSDr. David Alan Gilbert if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3579afcddefdSDr. David Alan Gilbert error_report("rdma: 'compress' bad block index %u (vs %d)",
3580afcddefdSDr. David Alan Gilbert (unsigned int)comp->block_idx,
3581afcddefdSDr. David Alan Gilbert rdma->local_ram_blocks.nb_blocks);
35820110c6b8SMarkus Armbruster goto err;
3583afcddefdSDr. David Alan Gilbert }
3584329c9b10SDr. David Alan Gilbert block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3585329c9b10SDr. David Alan Gilbert
3586329c9b10SDr. David Alan Gilbert host_addr = block->local_host_addr +
3587329c9b10SDr. David Alan Gilbert (comp->offset - block->offset);
3588413d64feSJuan Quintela if (comp->value) {
3589413d64feSJuan Quintela error_report("rdma: Zero page with non-zero (%d) value",
3590413d64feSJuan Quintela comp->value);
3591413d64feSJuan Quintela goto err;
3592413d64feSJuan Quintela }
35937091dabeSJuan Quintela ram_handle_zero(host_addr, comp->length);
3594329c9b10SDr. David Alan Gilbert break;
3595329c9b10SDr. David Alan Gilbert
3596329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_REGISTER_FINISHED:
3597b1b38387SJuan Quintela trace_rdma_registration_handle_finished();
35980110c6b8SMarkus Armbruster return 0;
3599329c9b10SDr. David Alan Gilbert
3600329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3601b1b38387SJuan Quintela trace_rdma_registration_handle_ram_blocks();
3602329c9b10SDr. David Alan Gilbert
3603e4d63320SDr. David Alan Gilbert /* Sort our local RAM Block list so it's the same as the source,
3604e4d63320SDr. David Alan Gilbert * we can do this since we've filled in a src_index in the list
3605e4d63320SDr. David Alan Gilbert * as we received the RAMBlock list earlier.
3606e4d63320SDr. David Alan Gilbert */
3607e4d63320SDr. David Alan Gilbert qsort(rdma->local_ram_blocks.block,
3608e4d63320SDr. David Alan Gilbert rdma->local_ram_blocks.nb_blocks,
3609e4d63320SDr. David Alan Gilbert sizeof(RDMALocalBlock), dest_ram_sort_func);
361014e2fcbbSJuan Quintela for (int i = 0; i < local->nb_blocks; i++) {
361171cd7306SLidong Chen local->block[i].index = i;
361271cd7306SLidong Chen }
361371cd7306SLidong Chen
3614329c9b10SDr. David Alan Gilbert if (rdma->pin_all) {
3615de1aa35fSMarkus Armbruster ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err);
36164a102179SMarkus Armbruster if (ret < 0) {
3617de1aa35fSMarkus Armbruster error_report_err(err);
36180110c6b8SMarkus Armbruster goto err;
3619329c9b10SDr. David Alan Gilbert }
3620329c9b10SDr. David Alan Gilbert }
3621329c9b10SDr. David Alan Gilbert
3622329c9b10SDr. David Alan Gilbert /*
3623329c9b10SDr. David Alan Gilbert * Dest uses this to prepare to transmit the RAMBlock descriptions
3624329c9b10SDr. David Alan Gilbert * to the source VM after connection setup.
3625329c9b10SDr. David Alan Gilbert * Both sides use the "remote" structure to communicate and update
3626329c9b10SDr. David Alan Gilbert * their "local" descriptions with what was sent.
3627329c9b10SDr. David Alan Gilbert */
362814e2fcbbSJuan Quintela for (int i = 0; i < local->nb_blocks; i++) {
3629a97270adSDr. David Alan Gilbert rdma->dest_blocks[i].remote_host_addr =
3630fbce8c25SStefan Weil (uintptr_t)(local->block[i].local_host_addr);
3631329c9b10SDr. David Alan Gilbert
3632329c9b10SDr. David Alan Gilbert if (rdma->pin_all) {
3633a97270adSDr. David Alan Gilbert rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3634329c9b10SDr. David Alan Gilbert }
3635329c9b10SDr. David Alan Gilbert
3636a97270adSDr. David Alan Gilbert rdma->dest_blocks[i].offset = local->block[i].offset;
3637a97270adSDr. David Alan Gilbert rdma->dest_blocks[i].length = local->block[i].length;
3638329c9b10SDr. David Alan Gilbert
3639a97270adSDr. David Alan Gilbert dest_block_to_network(&rdma->dest_blocks[i]);
3640b1b38387SJuan Quintela trace_rdma_registration_handle_ram_blocks_loop(
3641e4d63320SDr. David Alan Gilbert local->block[i].block_name,
3642e4d63320SDr. David Alan Gilbert local->block[i].offset,
3643e4d63320SDr. David Alan Gilbert local->block[i].length,
3644e4d63320SDr. David Alan Gilbert local->block[i].local_host_addr,
3645e4d63320SDr. David Alan Gilbert local->block[i].src_index);
3646329c9b10SDr. David Alan Gilbert }
3647329c9b10SDr. David Alan Gilbert
3648329c9b10SDr. David Alan Gilbert blocks.len = rdma->local_ram_blocks.nb_blocks
3649a97270adSDr. David Alan Gilbert * sizeof(RDMADestBlock);
3650329c9b10SDr. David Alan Gilbert
3651329c9b10SDr. David Alan Gilbert
3652329c9b10SDr. David Alan Gilbert ret = qemu_rdma_post_send_control(rdma,
3653f3805964SMarkus Armbruster (uint8_t *) rdma->dest_blocks, &blocks,
3654f3805964SMarkus Armbruster &err);
3655329c9b10SDr. David Alan Gilbert
3656329c9b10SDr. David Alan Gilbert if (ret < 0) {
3657f3805964SMarkus Armbruster error_report_err(err);
36580110c6b8SMarkus Armbruster goto err;
3659329c9b10SDr. David Alan Gilbert }
3660329c9b10SDr. David Alan Gilbert
3661329c9b10SDr. David Alan Gilbert break;
3662329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_REGISTER_REQUEST:
3663b1b38387SJuan Quintela trace_rdma_registration_handle_register(head.repeat);
3664329c9b10SDr. David Alan Gilbert
3665329c9b10SDr. David Alan Gilbert reg_resp.repeat = head.repeat;
3666329c9b10SDr. David Alan Gilbert registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3667329c9b10SDr. David Alan Gilbert
366814e2fcbbSJuan Quintela for (int count = 0; count < head.repeat; count++) {
3669329c9b10SDr. David Alan Gilbert uint64_t chunk;
3670329c9b10SDr. David Alan Gilbert uint8_t *chunk_start, *chunk_end;
3671329c9b10SDr. David Alan Gilbert
3672329c9b10SDr. David Alan Gilbert reg = &registers[count];
3673329c9b10SDr. David Alan Gilbert network_to_register(reg);
3674329c9b10SDr. David Alan Gilbert
3675329c9b10SDr. David Alan Gilbert reg_result = &results[count];
3676329c9b10SDr. David Alan Gilbert
3677b1b38387SJuan Quintela trace_rdma_registration_handle_register_loop(count,
3678329c9b10SDr. David Alan Gilbert reg->current_index, reg->key.current_addr, reg->chunks);
3679329c9b10SDr. David Alan Gilbert
3680afcddefdSDr. David Alan Gilbert if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3681afcddefdSDr. David Alan Gilbert error_report("rdma: 'register' bad block index %u (vs %d)",
3682afcddefdSDr. David Alan Gilbert (unsigned int)reg->current_index,
3683afcddefdSDr. David Alan Gilbert rdma->local_ram_blocks.nb_blocks);
36840110c6b8SMarkus Armbruster goto err;
3685afcddefdSDr. David Alan Gilbert }
3686329c9b10SDr. David Alan Gilbert block = &(rdma->local_ram_blocks.block[reg->current_index]);
3687329c9b10SDr. David Alan Gilbert if (block->is_ram_block) {
3688afcddefdSDr. David Alan Gilbert if (block->offset > reg->key.current_addr) {
3689afcddefdSDr. David Alan Gilbert error_report("rdma: bad register address for block %s"
3690afcddefdSDr. David Alan Gilbert " offset: %" PRIx64 " current_addr: %" PRIx64,
3691afcddefdSDr. David Alan Gilbert block->block_name, block->offset,
3692afcddefdSDr. David Alan Gilbert reg->key.current_addr);
36930110c6b8SMarkus Armbruster goto err;
3694afcddefdSDr. David Alan Gilbert }
3695329c9b10SDr. David Alan Gilbert host_addr = (block->local_host_addr +
3696329c9b10SDr. David Alan Gilbert (reg->key.current_addr - block->offset));
3697329c9b10SDr. David Alan Gilbert chunk = ram_chunk_index(block->local_host_addr,
3698329c9b10SDr. David Alan Gilbert (uint8_t *) host_addr);
3699329c9b10SDr. David Alan Gilbert } else {
3700329c9b10SDr. David Alan Gilbert chunk = reg->key.chunk;
3701329c9b10SDr. David Alan Gilbert host_addr = block->local_host_addr +
3702329c9b10SDr. David Alan Gilbert (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3703afcddefdSDr. David Alan Gilbert /* Check for particularly bad chunk value */
3704afcddefdSDr. David Alan Gilbert if (host_addr < (void *)block->local_host_addr) {
3705afcddefdSDr. David Alan Gilbert error_report("rdma: bad chunk for block %s"
3706afcddefdSDr. David Alan Gilbert " chunk: %" PRIx64,
3707afcddefdSDr. David Alan Gilbert block->block_name, reg->key.chunk);
37080110c6b8SMarkus Armbruster goto err;
3709afcddefdSDr. David Alan Gilbert }
3710329c9b10SDr. David Alan Gilbert }
3711329c9b10SDr. David Alan Gilbert chunk_start = ram_chunk_start(block, chunk);
3712329c9b10SDr. David Alan Gilbert chunk_end = ram_chunk_end(block, chunk + reg->chunks);
37139589e763SMarcel Apfelbaum /* avoid "-Waddress-of-packed-member" warning */
37149589e763SMarcel Apfelbaum uint32_t tmp_rkey = 0;
3715329c9b10SDr. David Alan Gilbert if (qemu_rdma_register_and_get_keys(rdma, block,
37169589e763SMarcel Apfelbaum (uintptr_t)host_addr, NULL, &tmp_rkey,
3717329c9b10SDr. David Alan Gilbert chunk, chunk_start, chunk_end)) {
3718733252deSDr. David Alan Gilbert error_report("cannot get rkey");
37190110c6b8SMarkus Armbruster goto err;
3720329c9b10SDr. David Alan Gilbert }
37219589e763SMarcel Apfelbaum reg_result->rkey = tmp_rkey;
3722329c9b10SDr. David Alan Gilbert
3723fbce8c25SStefan Weil reg_result->host_addr = (uintptr_t)block->local_host_addr;
3724329c9b10SDr. David Alan Gilbert
3725b1b38387SJuan Quintela trace_rdma_registration_handle_register_rkey(reg_result->rkey);
3726329c9b10SDr. David Alan Gilbert
3727329c9b10SDr. David Alan Gilbert result_to_network(reg_result);
3728329c9b10SDr. David Alan Gilbert }
3729329c9b10SDr. David Alan Gilbert
3730329c9b10SDr. David Alan Gilbert ret = qemu_rdma_post_send_control(rdma,
3731f3805964SMarkus Armbruster (uint8_t *) results, &reg_resp, &err);
3732329c9b10SDr. David Alan Gilbert
3733329c9b10SDr. David Alan Gilbert if (ret < 0) {
3734f3805964SMarkus Armbruster error_report_err(err);
37350110c6b8SMarkus Armbruster goto err;
3736329c9b10SDr. David Alan Gilbert }
3737329c9b10SDr. David Alan Gilbert break;
3738329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_UNREGISTER_REQUEST:
3739b1b38387SJuan Quintela trace_rdma_registration_handle_unregister(head.repeat);
3740329c9b10SDr. David Alan Gilbert unreg_resp.repeat = head.repeat;
3741329c9b10SDr. David Alan Gilbert registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3742329c9b10SDr. David Alan Gilbert
374314e2fcbbSJuan Quintela for (int count = 0; count < head.repeat; count++) {
3744329c9b10SDr. David Alan Gilbert reg = &registers[count];
3745329c9b10SDr. David Alan Gilbert network_to_register(reg);
3746329c9b10SDr. David Alan Gilbert
3747b1b38387SJuan Quintela trace_rdma_registration_handle_unregister_loop(count,
3748733252deSDr. David Alan Gilbert reg->current_index, reg->key.chunk);
3749329c9b10SDr. David Alan Gilbert
3750329c9b10SDr. David Alan Gilbert block = &(rdma->local_ram_blocks.block[reg->current_index]);
3751329c9b10SDr. David Alan Gilbert
3752329c9b10SDr. David Alan Gilbert ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3753329c9b10SDr. David Alan Gilbert block->pmr[reg->key.chunk] = NULL;
3754329c9b10SDr. David Alan Gilbert
3755329c9b10SDr. David Alan Gilbert if (ret != 0) {
3756ff4c9194SMarkus Armbruster error_report("rdma unregistration chunk failed: %s",
3757ff4c9194SMarkus Armbruster strerror(errno));
37580110c6b8SMarkus Armbruster goto err;
3759329c9b10SDr. David Alan Gilbert }
3760329c9b10SDr. David Alan Gilbert
3761329c9b10SDr. David Alan Gilbert rdma->total_registrations--;
3762329c9b10SDr. David Alan Gilbert
3763b1b38387SJuan Quintela trace_rdma_registration_handle_unregister_success(reg->key.chunk);
3764329c9b10SDr. David Alan Gilbert }
3765329c9b10SDr. David Alan Gilbert
3766f3805964SMarkus Armbruster ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err);
3767329c9b10SDr. David Alan Gilbert
3768329c9b10SDr. David Alan Gilbert if (ret < 0) {
3769f3805964SMarkus Armbruster error_report_err(err);
37700110c6b8SMarkus Armbruster goto err;
3771329c9b10SDr. David Alan Gilbert }
3772329c9b10SDr. David Alan Gilbert break;
3773329c9b10SDr. David Alan Gilbert case RDMA_CONTROL_REGISTER_RESULT:
3774733252deSDr. David Alan Gilbert error_report("Invalid RESULT message at dest.");
37750110c6b8SMarkus Armbruster goto err;
3776329c9b10SDr. David Alan Gilbert default:
3777482a33c5SDr. David Alan Gilbert error_report("Unknown control message %s", control_desc(head.type));
37780110c6b8SMarkus Armbruster goto err;
3779329c9b10SDr. David Alan Gilbert }
3780329c9b10SDr. David Alan Gilbert } while (1);
37810110c6b8SMarkus Armbruster
37820110c6b8SMarkus Armbruster err:
3783b86c94a4SMarkus Armbruster rdma->errored = true;
37840110c6b8SMarkus Armbruster return -1;
3785329c9b10SDr. David Alan Gilbert }
3786329c9b10SDr. David Alan Gilbert
3787e4d63320SDr. David Alan Gilbert /* Destination:
3788a6323300SJuan Quintela * Called during the initial RAM load section which lists the
3789a6323300SJuan Quintela * RAMBlocks by name. This lets us know the order of the RAMBlocks on
3790a6323300SJuan Quintela * the source. We've already built our local RAMBlock list, but not
3791a6323300SJuan Quintela * yet sent the list to the source.
3792e4d63320SDr. David Alan Gilbert */
3793a6323300SJuan Quintela int rdma_block_notification_handle(QEMUFile *f, const char *name)
3794e4d63320SDr. David Alan Gilbert {
3795e4d63320SDr. David Alan Gilbert int curr;
3796e4d63320SDr. David Alan Gilbert int found = -1;
3797e4d63320SDr. David Alan Gilbert
3798a6323300SJuan Quintela if (!migrate_rdma()) {
3799a6323300SJuan Quintela return 0;
3800a6323300SJuan Quintela }
3801a6323300SJuan Quintela
3802987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
3803a6323300SJuan Quintela QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3804a6323300SJuan Quintela RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain);
380574637e6fSLidong Chen
380674637e6fSLidong Chen if (!rdma) {
38070110c6b8SMarkus Armbruster return -1;
380874637e6fSLidong Chen }
380974637e6fSLidong Chen
3810e4d63320SDr. David Alan Gilbert /* Find the matching RAMBlock in our local list */
3811e4d63320SDr. David Alan Gilbert for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3812e4d63320SDr. David Alan Gilbert if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3813e4d63320SDr. David Alan Gilbert found = curr;
3814e4d63320SDr. David Alan Gilbert break;
3815e4d63320SDr. David Alan Gilbert }
3816e4d63320SDr. David Alan Gilbert }
3817e4d63320SDr. David Alan Gilbert
3818e4d63320SDr. David Alan Gilbert if (found == -1) {
3819e4d63320SDr. David Alan Gilbert error_report("RAMBlock '%s' not found on destination", name);
38200110c6b8SMarkus Armbruster return -1;
3821e4d63320SDr. David Alan Gilbert }
3822e4d63320SDr. David Alan Gilbert
3823e4d63320SDr. David Alan Gilbert rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3824e4d63320SDr. David Alan Gilbert trace_rdma_block_notification_handle(name, rdma->next_src_index);
3825e4d63320SDr. David Alan Gilbert rdma->next_src_index++;
3826e4d63320SDr. David Alan Gilbert
3827e4d63320SDr. David Alan Gilbert return 0;
3828e4d63320SDr. David Alan Gilbert }
3829e4d63320SDr. David Alan Gilbert
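/*
 * Source side: at the start of each RAM section, emit the
 * RAM_SAVE_FLAG_HOOK marker so that the destination's load path runs
 * rdma_registration_handle(), then flush the QEMUFile.
 */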
3830b1b38387SJuan Quintela int rdma_registration_start(QEMUFile *f, uint64_t flags)
3831329c9b10SDr. David Alan Gilbert {
383248408174SJuan Quintela if (!migrate_rdma() || migration_in_postcopy()) {
3833cd01a602SJuan Quintela return 0;
3834cd01a602SJuan Quintela }
3835cd01a602SJuan Quintela
383648408174SJuan Quintela QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3837987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
383848408174SJuan Quintela RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout);
383974637e6fSLidong Chen if (!rdma) {
38400110c6b8SMarkus Armbruster return -1;
384174637e6fSLidong Chen }
3842329c9b10SDr. David Alan Gilbert
3843b86c94a4SMarkus Armbruster if (rdma_errored(rdma)) {
38440110c6b8SMarkus Armbruster return -1;
3845de3e05e8SMarkus Armbruster }
3846329c9b10SDr. David Alan Gilbert
3847b1b38387SJuan Quintela trace_rdma_registration_start(flags);
3848329c9b10SDr. David Alan Gilbert qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3849be07a0edSJuan Quintela return qemu_fflush(f);
3850329c9b10SDr. David Alan Gilbert }
3851329c9b10SDr. David Alan Gilbert
3852329c9b10SDr. David Alan Gilbert /*
3853329c9b10SDr. David Alan Gilbert * Inform dest that dynamic registrations are done for now.
3854329c9b10SDr. David Alan Gilbert * First, flush writes, if any.
3855329c9b10SDr. David Alan Gilbert */
3856b1b38387SJuan Quintela int rdma_registration_stop(QEMUFile *f, uint64_t flags)
3857329c9b10SDr. David Alan Gilbert {
38585f5b8858SJuan Quintela QIOChannelRDMA *rioc;
3859c4c78dceSMarkus Armbruster Error *err = NULL;
386074637e6fSLidong Chen RDMAContext *rdma;
3861329c9b10SDr. David Alan Gilbert RDMAControlHeader head = { .len = 0, .repeat = 1 };
3862c0d77702SMarkus Armbruster int ret;
3863329c9b10SDr. David Alan Gilbert
38645f5b8858SJuan Quintela if (!migrate_rdma() || migration_in_postcopy()) {
3865cd01a602SJuan Quintela return 0;
3866cd01a602SJuan Quintela }
3867cd01a602SJuan Quintela
3868987ab2a5SDr. David Alan Gilbert RCU_READ_LOCK_GUARD();
38695f5b8858SJuan Quintela rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3870d73415a3SStefan Hajnoczi rdma = qatomic_rcu_read(&rioc->rdmaout);
387174637e6fSLidong Chen if (!rdma) {
38720110c6b8SMarkus Armbruster return -1;
387374637e6fSLidong Chen }
387474637e6fSLidong Chen
3875b86c94a4SMarkus Armbruster if (rdma_errored(rdma)) {
38760110c6b8SMarkus Armbruster return -1;
3877de3e05e8SMarkus Armbruster }
3878329c9b10SDr. David Alan Gilbert
3879329c9b10SDr. David Alan Gilbert qemu_fflush(f);
3880e3378035SJuan Quintela ret = qemu_rdma_drain_cq(rdma);
3881329c9b10SDr. David Alan Gilbert
3882329c9b10SDr. David Alan Gilbert if (ret < 0) {
3883329c9b10SDr. David Alan Gilbert goto err;
3884329c9b10SDr. David Alan Gilbert }
3885329c9b10SDr. David Alan Gilbert
3886329c9b10SDr. David Alan Gilbert if (flags == RAM_CONTROL_SETUP) {
3887329c9b10SDr. David Alan Gilbert RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3888329c9b10SDr. David Alan Gilbert RDMALocalBlocks *local = &rdma->local_ram_blocks;
388914e2fcbbSJuan Quintela int reg_result_idx, nb_dest_blocks;
3890329c9b10SDr. David Alan Gilbert
3891329c9b10SDr. David Alan Gilbert head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3892b1b38387SJuan Quintela trace_rdma_registration_stop_ram();
3893329c9b10SDr. David Alan Gilbert
3894329c9b10SDr. David Alan Gilbert /*
3895329c9b10SDr. David Alan Gilbert * Make sure that we parallelize the pinning on both sides.
3896329c9b10SDr. David Alan Gilbert * For very large guests, doing this serially takes a really
3897329c9b10SDr. David Alan Gilbert * long time, so we have to 'interleave' the pinning locally
3898329c9b10SDr. David Alan Gilbert * with the control messages by performing the pinning on this
3899329c9b10SDr. David Alan Gilbert * side before we receive the control response from the other
3900329c9b10SDr. David Alan Gilbert * side that the pinning has completed.
3901329c9b10SDr. David Alan Gilbert */
3902329c9b10SDr. David Alan Gilbert ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3903329c9b10SDr. David Alan Gilbert &reg_result_idx, rdma->pin_all ?
3904c4c78dceSMarkus Armbruster qemu_rdma_reg_whole_ram_blocks : NULL,
3905c4c78dceSMarkus Armbruster &err);
3906329c9b10SDr. David Alan Gilbert if (ret < 0) {
3907c4c78dceSMarkus Armbruster error_report_err(err);
39080110c6b8SMarkus Armbruster return -1;
3909329c9b10SDr. David Alan Gilbert }
3910329c9b10SDr. David Alan Gilbert
3911a97270adSDr. David Alan Gilbert nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3912329c9b10SDr. David Alan Gilbert
3913329c9b10SDr. David Alan Gilbert /*
3914329c9b10SDr. David Alan Gilbert * The protocol uses two different sets of rkeys (mutually exclusive):
3915329c9b10SDr. David Alan Gilbert * 1. One key to represent the virtual address of the entire ram block.
3916329c9b10SDr. David Alan Gilbert * (dynamic chunk registration disabled - pin everything with one rkey.)
3917329c9b10SDr. David Alan Gilbert * 2. One to represent individual chunks within a ram block.
3918329c9b10SDr. David Alan Gilbert * (dynamic chunk registration enabled - pin individual chunks.)
3919329c9b10SDr. David Alan Gilbert *
3920329c9b10SDr. David Alan Gilbert * Once the capability is successfully negotiated, the destination transmits
3921329c9b10SDr. David Alan Gilbert * the keys to use (or sends them later) including the virtual addresses
3922329c9b10SDr. David Alan Gilbert * and then propagates the remote ram block descriptions to its local copy.
3923329c9b10SDr. David Alan Gilbert */
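/*
 * Illustrative shape of the RAM_BLOCKS_RESULT payload consumed below
 * (a sketch; the authoritative RDMADestBlock definition appears earlier
 * in this file).  The destination returns one descriptor per ram block,
 * in network byte order:
 *
 *     nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
 *     dest_blocks[i].remote_host_addr   base address to RDMA_WRITE into
 *     dest_blocks[i].length             must match the local block length
 *     dest_blocks[i].remote_rkey        rkey used when pin-all is negotiated
 *
 * network_to_dest_block() converts each descriptor to host byte order
 * before its fields are copied into the local block list.
 */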
3924329c9b10SDr. David Alan Gilbert
3925a97270adSDr. David Alan Gilbert if (local->nb_blocks != nb_dest_blocks) {
3926ff4c9194SMarkus Armbruster error_report("ram blocks mismatch (Number of blocks %d vs %d)",
3927e4d63320SDr. David Alan Gilbert local->nb_blocks, nb_dest_blocks);
3928ff4c9194SMarkus Armbruster error_printf("Your QEMU command line parameters are probably "
3929ff4c9194SMarkus Armbruster "not identical on both the source and destination.");
3930b86c94a4SMarkus Armbruster rdma->errored = true;
39310110c6b8SMarkus Armbruster return -1;
3932329c9b10SDr. David Alan Gilbert }
3933329c9b10SDr. David Alan Gilbert
3934329c9b10SDr. David Alan Gilbert qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3935a97270adSDr. David Alan Gilbert memcpy(rdma->dest_blocks,
3936329c9b10SDr. David Alan Gilbert rdma->wr_data[reg_result_idx].control_curr, resp.len);
393714e2fcbbSJuan Quintela for (int i = 0; i < nb_dest_blocks; i++) {
3938a97270adSDr. David Alan Gilbert network_to_dest_block(&rdma->dest_blocks[i]);
3939329c9b10SDr. David Alan Gilbert
3940e4d63320SDr. David Alan Gilbert /* We require that the blocks are in the same order */
3941e4d63320SDr. David Alan Gilbert if (rdma->dest_blocks[i].length != local->block[i].length) {
3942ff4c9194SMarkus Armbruster error_report("Block %s/%d has a different length %" PRIu64
3943ff4c9194SMarkus Armbruster " vs %" PRIu64,
3944ff4c9194SMarkus Armbruster local->block[i].block_name, i,
3945e4d63320SDr. David Alan Gilbert local->block[i].length,
3946e4d63320SDr. David Alan Gilbert rdma->dest_blocks[i].length);
3947b86c94a4SMarkus Armbruster rdma->errored = true;
39480110c6b8SMarkus Armbruster return -1;
3949329c9b10SDr. David Alan Gilbert }
3950e4d63320SDr. David Alan Gilbert local->block[i].remote_host_addr =
3951a97270adSDr. David Alan Gilbert rdma->dest_blocks[i].remote_host_addr;
3952e4d63320SDr. David Alan Gilbert local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3953329c9b10SDr. David Alan Gilbert }
3954329c9b10SDr. David Alan Gilbert }
3955329c9b10SDr. David Alan Gilbert
3956b1b38387SJuan Quintela trace_rdma_registration_stop(flags);
3957329c9b10SDr. David Alan Gilbert
3958329c9b10SDr. David Alan Gilbert head.type = RDMA_CONTROL_REGISTER_FINISHED;
3959c4c78dceSMarkus Armbruster ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err);
3960329c9b10SDr. David Alan Gilbert
3961329c9b10SDr. David Alan Gilbert if (ret < 0) {
3962c4c78dceSMarkus Armbruster error_report_err(err);
3963329c9b10SDr. David Alan Gilbert goto err;
3964329c9b10SDr. David Alan Gilbert }
3965329c9b10SDr. David Alan Gilbert
3966329c9b10SDr. David Alan Gilbert return 0;
3967329c9b10SDr. David Alan Gilbert err:
3968b86c94a4SMarkus Armbruster rdma->errored = true;
39690110c6b8SMarkus Armbruster return -1;
3970329c9b10SDr. David Alan Gilbert }
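/*
 * Sketch of the control exchange driven by rdma_registration_start() and
 * rdma_registration_stop() during setup (the destination-side handlers
 * live elsewhere in this file):
 *
 *   source                                 destination
 *   ------                                 -----------
 *   RAM_SAVE_FLAG_HOOK (QEMUFile)    --->  enter its load hook
 *   RDMA_CONTROL_RAM_BLOCKS_REQUEST  --->  pin/describe its ram blocks
 *                                    <---  RDMA_CONTROL_RAM_BLOCKS_RESULT
 *   RDMA_CONTROL_REGISTER_FINISHED   --->  continue loading the stream
 *
 * The RAM_BLOCKS_REQUEST/RESULT pair is exchanged only when
 * flags == RAM_CONTROL_SETUP; later rounds just flush pending writes,
 * drain the completion queue and send REGISTER_FINISHED.
 */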
3971329c9b10SDr. David Alan Gilbert
39726ddd2d76SDaniel P. Berrange static void qio_channel_rdma_finalize(Object *obj)
3973329c9b10SDr. David Alan Gilbert {
39746ddd2d76SDaniel P. Berrange QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
397574637e6fSLidong Chen if (rioc->rdmain) {
397674637e6fSLidong Chen qemu_rdma_cleanup(rioc->rdmain);
397774637e6fSLidong Chen g_free(rioc->rdmain);
397874637e6fSLidong Chen rioc->rdmain = NULL;
397974637e6fSLidong Chen }
398074637e6fSLidong Chen if (rioc->rdmaout) {
398174637e6fSLidong Chen qemu_rdma_cleanup(rioc->rdmaout);
398274637e6fSLidong Chen g_free(rioc->rdmaout);
398374637e6fSLidong Chen rioc->rdmaout = NULL;
39846ddd2d76SDaniel P. Berrange }
39856ddd2d76SDaniel P. Berrange }
39866ddd2d76SDaniel P. Berrange
39876ddd2d76SDaniel P. Berrange static void qio_channel_rdma_class_init(ObjectClass *klass,
39886ddd2d76SDaniel P. Berrange void *class_data G_GNUC_UNUSED)
39896ddd2d76SDaniel P. Berrange {
39906ddd2d76SDaniel P. Berrange QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
39916ddd2d76SDaniel P. Berrange
39926ddd2d76SDaniel P. Berrange ioc_klass->io_writev = qio_channel_rdma_writev;
39936ddd2d76SDaniel P. Berrange ioc_klass->io_readv = qio_channel_rdma_readv;
39946ddd2d76SDaniel P. Berrange ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
39956ddd2d76SDaniel P. Berrange ioc_klass->io_close = qio_channel_rdma_close;
39966ddd2d76SDaniel P. Berrange ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
39974d9f675bSLidong Chen ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
399854db882fSLidong Chen ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
39996ddd2d76SDaniel P. Berrange }
40006ddd2d76SDaniel P. Berrange
40016ddd2d76SDaniel P. Berrange static const TypeInfo qio_channel_rdma_info = {
40026ddd2d76SDaniel P. Berrange .parent = TYPE_QIO_CHANNEL,
40036ddd2d76SDaniel P. Berrange .name = TYPE_QIO_CHANNEL_RDMA,
40046ddd2d76SDaniel P. Berrange .instance_size = sizeof(QIOChannelRDMA),
40056ddd2d76SDaniel P. Berrange .instance_finalize = qio_channel_rdma_finalize,
40066ddd2d76SDaniel P. Berrange .class_init = qio_channel_rdma_class_init,
40076ddd2d76SDaniel P. Berrange };
40086ddd2d76SDaniel P. Berrange
40096ddd2d76SDaniel P. Berrange static void qio_channel_rdma_register_types(void)
40106ddd2d76SDaniel P. Berrange {
40116ddd2d76SDaniel P. Berrange type_register_static(&qio_channel_rdma_info);
40126ddd2d76SDaniel P. Berrange }
40136ddd2d76SDaniel P. Berrange
40146ddd2d76SDaniel P. Berrange type_init(qio_channel_rdma_register_types);
40156ddd2d76SDaniel P. Berrange
4016697c4c86SJuan Quintela static QEMUFile *rdma_new_input(RDMAContext *rdma)
40176ddd2d76SDaniel P. Berrange {
4018697c4c86SJuan Quintela QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4019329c9b10SDr. David Alan Gilbert
402077ef2dc1SDaniel P. Berrangé rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
402174637e6fSLidong Chen rioc->rdmain = rdma;
402274637e6fSLidong Chen rioc->rdmaout = rdma->return_path;
4023697c4c86SJuan Quintela
4024697c4c86SJuan Quintela return rioc->file;
4025329c9b10SDr. David Alan Gilbert }
4026329c9b10SDr. David Alan Gilbert
4027697c4c86SJuan Quintela static QEMUFile *rdma_new_output(RDMAContext *rdma)
4028697c4c86SJuan Quintela {
4029697c4c86SJuan Quintela QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4030697c4c86SJuan Quintela
4031697c4c86SJuan Quintela rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
4032697c4c86SJuan Quintela rioc->rdmaout = rdma;
4033697c4c86SJuan Quintela rioc->rdmain = rdma->return_path;
4034697c4c86SJuan Quintela
40356ddd2d76SDaniel P. Berrange return rioc->file;
4036329c9b10SDr. David Alan Gilbert }
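/*
 * rdma_new_input() and rdma_new_output() wire the same pair of contexts
 * in mirror image (rdmain and rdmaout swapped), so the channel created
 * for one direction can also carry traffic for the opposite direction
 * through rdma->return_path when one has been set up.
 */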
4037329c9b10SDr. David Alan Gilbert
4038329c9b10SDr. David Alan Gilbert static void rdma_accept_incoming_migration(void *opaque)
4039329c9b10SDr. David Alan Gilbert {
4040329c9b10SDr. David Alan Gilbert RDMAContext *rdma = opaque;
4041329c9b10SDr. David Alan Gilbert QEMUFile *f;
4042329c9b10SDr. David Alan Gilbert
404324ec68efSDr. David Alan Gilbert trace_qemu_rdma_accept_incoming_migration();
40448f5a7faaSJuan Quintela if (qemu_rdma_accept(rdma) < 0) {
4045ff4c9194SMarkus Armbruster error_report("RDMA ERROR: Migration initialization failed");
4046329c9b10SDr. David Alan Gilbert return;
4047329c9b10SDr. David Alan Gilbert }
4048329c9b10SDr. David Alan Gilbert
404924ec68efSDr. David Alan Gilbert trace_qemu_rdma_accept_incoming_migration_accepted();
4050329c9b10SDr. David Alan Gilbert
405155cc1b59SLidong Chen if (rdma->is_return_path) {
405255cc1b59SLidong Chen return;
405355cc1b59SLidong Chen }
405455cc1b59SLidong Chen
4055697c4c86SJuan Quintela f = rdma_new_input(rdma);
4056329c9b10SDr. David Alan Gilbert if (f == NULL) {
4057ff4c9194SMarkus Armbruster error_report("RDMA ERROR: could not open RDMA for input");
4058329c9b10SDr. David Alan Gilbert qemu_rdma_cleanup(rdma);
4059329c9b10SDr. David Alan Gilbert return;
4060329c9b10SDr. David Alan Gilbert }
4061329c9b10SDr. David Alan Gilbert
4062329c9b10SDr. David Alan Gilbert rdma->migration_started_on_destination = 1;
4063b0cf3bfcSAvihai Horon migration_fd_process_incoming(f);
4064329c9b10SDr. David Alan Gilbert }
4065329c9b10SDr. David Alan Gilbert
40663fa9642fSHet Gala void rdma_start_incoming_migration(InetSocketAddress *host_port,
40673fa9642fSHet Gala Error **errp)
4068329c9b10SDr. David Alan Gilbert {
406927fd25b0SJuan Quintela MigrationState *s = migrate_get_current();
4070329c9b10SDr. David Alan Gilbert int ret;
4071bf027419SLi Zhijian RDMAContext *rdma;
4072329c9b10SDr. David Alan Gilbert
4073733252deSDr. David Alan Gilbert trace_rdma_start_incoming_migration();
4074329c9b10SDr. David Alan Gilbert
40755f1f1902SDavid Hildenbrand /* Avoid ram_block_discard_disable(), cannot change during migration. */
40765f1f1902SDavid Hildenbrand if (ram_block_discard_is_required()) {
40775f1f1902SDavid Hildenbrand error_setg(errp, "RDMA: cannot disable RAM discard");
40785f1f1902SDavid Hildenbrand return;
40795f1f1902SDavid Hildenbrand }
40805f1f1902SDavid Hildenbrand
4081b16defbbSMarkus Armbruster rdma = qemu_rdma_data_init(host_port, errp);
4082329c9b10SDr. David Alan Gilbert if (rdma == NULL) {
4083329c9b10SDr. David Alan Gilbert goto err;
4084329c9b10SDr. David Alan Gilbert }
4085329c9b10SDr. David Alan Gilbert
4086b16defbbSMarkus Armbruster ret = qemu_rdma_dest_init(rdma, errp);
40874a102179SMarkus Armbruster if (ret < 0) {
4088329c9b10SDr. David Alan Gilbert goto err;
4089329c9b10SDr. David Alan Gilbert }
4090329c9b10SDr. David Alan Gilbert
4091733252deSDr. David Alan Gilbert trace_rdma_start_incoming_migration_after_dest_init();
4092329c9b10SDr. David Alan Gilbert
4093329c9b10SDr. David Alan Gilbert ret = rdma_listen(rdma->listen_id, 5);
4094329c9b10SDr. David Alan Gilbert
40954a102179SMarkus Armbruster if (ret < 0) {
40968fd471bdSMarkus Armbruster error_setg(errp, "RDMA ERROR: failed to listen on socket");
40974e812d23SLi Zhijian goto cleanup_rdma;
4098329c9b10SDr. David Alan Gilbert }
4099329c9b10SDr. David Alan Gilbert
4100733252deSDr. David Alan Gilbert trace_rdma_start_incoming_migration_after_rdma_listen();
410127fd25b0SJuan Quintela s->rdma_migration = true;
410282e1cc4bSFam Zheng qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
410382e1cc4bSFam Zheng NULL, (void *)(intptr_t)rdma);
4104329c9b10SDr. David Alan Gilbert return;
41054e812d23SLi Zhijian
41064e812d23SLi Zhijian cleanup_rdma:
41074e812d23SLi Zhijian qemu_rdma_cleanup(rdma);
4108329c9b10SDr. David Alan Gilbert err:
41093b59ee72SPan Nengyuan if (rdma) {
411059c59c67SPan Nengyuan g_free(rdma->host);
41113b59ee72SPan Nengyuan }
4112329c9b10SDr. David Alan Gilbert g_free(rdma);
4113329c9b10SDr. David Alan Gilbert }
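/*
 * Typical entry point: the destination is launched with an RDMA incoming
 * URI, for example (illustrative; host and port are placeholders):
 *
 *     qemu-system-x86_64 ... -incoming rdma:192.168.1.10:4444
 *
 * which is parsed into the InetSocketAddress handed to this function.
 */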
4114329c9b10SDr. David Alan Gilbert
4115329c9b10SDr. David Alan Gilbert void rdma_start_outgoing_migration(void *opaque,
41163fa9642fSHet Gala InetSocketAddress *host_port, Error **errp)
4117329c9b10SDr. David Alan Gilbert {
4118329c9b10SDr. David Alan Gilbert MigrationState *s = opaque;
411955cc1b59SLidong Chen RDMAContext *rdma_return_path = NULL;
41205f1f1902SDavid Hildenbrand RDMAContext *rdma;
4121c0d77702SMarkus Armbruster int ret;
4122329c9b10SDr. David Alan Gilbert
41235f1f1902SDavid Hildenbrand /* Avoid ram_block_discard_disable(), cannot change during migration. */
41245f1f1902SDavid Hildenbrand if (ram_block_discard_is_required()) {
41255f1f1902SDavid Hildenbrand error_setg(errp, "RDMA: cannot disable RAM discard");
41265f1f1902SDavid Hildenbrand return;
41275f1f1902SDavid Hildenbrand }
41285f1f1902SDavid Hildenbrand
41295f1f1902SDavid Hildenbrand rdma = qemu_rdma_data_init(host_port, errp);
4130329c9b10SDr. David Alan Gilbert if (rdma == NULL) {
4131329c9b10SDr. David Alan Gilbert goto err;
4132329c9b10SDr. David Alan Gilbert }
4133329c9b10SDr. David Alan Gilbert
413417cba690SJuan Quintela ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
4135329c9b10SDr. David Alan Gilbert
41364a102179SMarkus Armbruster if (ret < 0) {
4137329c9b10SDr. David Alan Gilbert goto err;
4138329c9b10SDr. David Alan Gilbert }
4139329c9b10SDr. David Alan Gilbert
4140733252deSDr. David Alan Gilbert trace_rdma_start_outgoing_migration_after_rdma_source_init();
41413c03f21cSMarkus Armbruster ret = qemu_rdma_connect(rdma, false, errp);
4142329c9b10SDr. David Alan Gilbert
41434a102179SMarkus Armbruster if (ret < 0) {
4144329c9b10SDr. David Alan Gilbert goto err;
4145329c9b10SDr. David Alan Gilbert }
4146329c9b10SDr. David Alan Gilbert
41473a4452d8Szhaolichang /* RDMA postcopy needs a separate queue pair for the return path */
414838ad1110SJuan Quintela if (migrate_postcopy() || migrate_return_path()) {
414955cc1b59SLidong Chen rdma_return_path = qemu_rdma_data_init(host_port, errp);
415055cc1b59SLidong Chen
415155cc1b59SLidong Chen if (rdma_return_path == NULL) {
41522f0c285aSPan Nengyuan goto return_path_err;
415355cc1b59SLidong Chen }
415455cc1b59SLidong Chen
415555cc1b59SLidong Chen ret = qemu_rdma_source_init(rdma_return_path,
415617cba690SJuan Quintela migrate_rdma_pin_all(), errp);
415755cc1b59SLidong Chen
41584a102179SMarkus Armbruster if (ret < 0) {
41592f0c285aSPan Nengyuan goto return_path_err;
416055cc1b59SLidong Chen }
416155cc1b59SLidong Chen
41623c03f21cSMarkus Armbruster ret = qemu_rdma_connect(rdma_return_path, true, errp);
416355cc1b59SLidong Chen
41644a102179SMarkus Armbruster if (ret < 0) {
41652f0c285aSPan Nengyuan goto return_path_err;
416655cc1b59SLidong Chen }
416755cc1b59SLidong Chen
416855cc1b59SLidong Chen rdma->return_path = rdma_return_path;
416955cc1b59SLidong Chen rdma_return_path->return_path = rdma;
417055cc1b59SLidong Chen rdma_return_path->is_return_path = true;
417155cc1b59SLidong Chen }
417255cc1b59SLidong Chen
4173733252deSDr. David Alan Gilbert trace_rdma_start_outgoing_migration_after_rdma_connect();
4174329c9b10SDr. David Alan Gilbert
4175697c4c86SJuan Quintela s->to_dst_file = rdma_new_output(rdma);
417627fd25b0SJuan Quintela s->rdma_migration = true;
4177cce8040bSDr. David Alan Gilbert migrate_fd_connect(s, NULL);
4178329c9b10SDr. David Alan Gilbert return;
41792f0c285aSPan Nengyuan return_path_err:
41802f0c285aSPan Nengyuan qemu_rdma_cleanup(rdma);
4181329c9b10SDr. David Alan Gilbert err:
4182329c9b10SDr. David Alan Gilbert g_free(rdma);
418355cc1b59SLidong Chen g_free(rdma_return_path);
4184329c9b10SDr. David Alan Gilbert }
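/*
 * On the source side this is reached through the migrate command with an
 * rdma: URI, for example (illustrative; host and port are placeholders):
 *
 *     (qemu) migrate -d rdma:192.168.1.10:4444
 *
 * When postcopy or a return path is enabled, a second connection is
 * established above and cross-linked via rdma->return_path.
 */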
4185