xref: /openbmc/linux/net/rds/ib_cm.c (revision f20c7d91)
1 /*
2  * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  *
32  */
33 #include <linux/dmapool.h>
34 #include <linux/kernel.h>
35 #include <linux/in.h>
36 #include <linux/slab.h>
37 #include <linux/vmalloc.h>
38 #include <linux/ratelimit.h>
39 #include <net/addrconf.h>
40 #include <rdma/ib_cm.h>
41 
42 #include "rds_single_path.h"
43 #include "rds.h"
44 #include "ib.h"
45 #include "ib_mr.h"
46 
47 /*
48  * Set the selected protocol version
49  */
50 static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
51 {
52 	conn->c_version = version;
53 }
54 
55 /*
56  * Set up flow control
57  */
58 static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
59 {
60 	struct rds_ib_connection *ic = conn->c_transport_data;
61 
62 	if (rds_ib_sysctl_flow_control && credits != 0) {
63 		/* We're doing flow control */
64 		ic->i_flowctl = 1;
65 		rds_ib_send_add_credits(conn, credits);
66 	} else {
67 		ic->i_flowctl = 0;
68 	}
69 }
70 
71 /*
72  * Tune RNR behavior. Without flow control, we use a rather
73  * low timeout, but not the absolute minimum - this should
74  * be tunable.
75  *
76  * We already set the RNR retry count to 7 (which is the
77  * smallest infinite number :-) above.
78  * If flow control is off, we want to change this back to 0
79  * so that we learn quickly when our credit accounting is
80  * buggy.
81  *
82  * Caller passes in a qp_attr pointer - don't waste stack spacv
83  * by allocation this twice.
84  */
85 static void
86 rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
87 {
88 	int ret;
89 
90 	attr->min_rnr_timer = IB_RNR_TIMER_000_32;
91 	ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
92 	if (ret)
93 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
94 }
95 
96 /*
97  * Connection established.
98  * We get here for both outgoing and incoming connection.
99  */
100 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
101 {
102 	struct rds_ib_connection *ic = conn->c_transport_data;
103 	const union rds_ib_conn_priv *dp = NULL;
104 	struct ib_qp_attr qp_attr;
105 	__be64 ack_seq = 0;
106 	__be32 credit = 0;
107 	u8 major = 0;
108 	u8 minor = 0;
109 	int err;
110 
111 	dp = event->param.conn.private_data;
112 	if (conn->c_isv6) {
113 		if (event->param.conn.private_data_len >=
114 		    sizeof(struct rds6_ib_connect_private)) {
115 			major = dp->ricp_v6.dp_protocol_major;
116 			minor = dp->ricp_v6.dp_protocol_minor;
117 			credit = dp->ricp_v6.dp_credit;
118 			/* dp structure start is not guaranteed to be 8 bytes
119 			 * aligned.  Since dp_ack_seq is 64-bit extended load
120 			 * operations can be used so go through get_unaligned
121 			 * to avoid unaligned errors.
122 			 */
123 			ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
124 		}
125 	} else if (event->param.conn.private_data_len >=
126 		   sizeof(struct rds_ib_connect_private)) {
127 		major = dp->ricp_v4.dp_protocol_major;
128 		minor = dp->ricp_v4.dp_protocol_minor;
129 		credit = dp->ricp_v4.dp_credit;
130 		ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
131 	}
132 
133 	/* make sure it isn't empty data */
134 	if (major) {
135 		rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
136 		rds_ib_set_flow_control(conn, be32_to_cpu(credit));
137 	}
138 
139 	if (conn->c_version < RDS_PROTOCOL_VERSION) {
140 		if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) {
141 			pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
142 				  &conn->c_laddr, &conn->c_faddr,
143 				  RDS_PROTOCOL_MAJOR(conn->c_version),
144 				  RDS_PROTOCOL_MINOR(conn->c_version));
145 			rds_conn_destroy(conn);
146 			return;
147 		}
148 	}
149 
150 	pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c,%d> version %u.%u%s\n",
151 		  ic->i_active_side ? "Active" : "Passive",
152 		  &conn->c_laddr, &conn->c_faddr, conn->c_tos,
153 		  RDS_PROTOCOL_MAJOR(conn->c_version),
154 		  RDS_PROTOCOL_MINOR(conn->c_version),
155 		  ic->i_flowctl ? ", flow control" : "");
156 
157 	/* receive sl from the peer */
158 	ic->i_sl = ic->i_cm_id->route.path_rec->sl;
159 
160 	atomic_set(&ic->i_cq_quiesce, 0);
161 
162 	/* Init rings and fill recv. this needs to wait until protocol
163 	 * negotiation is complete, since ring layout is different
164 	 * from 3.1 to 4.1.
165 	 */
166 	rds_ib_send_init_ring(ic);
167 	rds_ib_recv_init_ring(ic);
168 	/* Post receive buffers - as a side effect, this will update
169 	 * the posted credit count. */
170 	rds_ib_recv_refill(conn, 1, GFP_KERNEL);
171 
172 	/* Tune RNR behavior */
173 	rds_ib_tune_rnr(ic, &qp_attr);
174 
175 	qp_attr.qp_state = IB_QPS_RTS;
176 	err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
177 	if (err)
178 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
179 
180 	/* update ib_device with this local ipaddr */
181 	err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
182 	if (err)
183 		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
184 			err);
185 
186 	/* If the peer gave us the last packet it saw, process this as if
187 	 * we had received a regular ACK. */
188 	if (dp) {
189 		if (ack_seq)
190 			rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
191 					    NULL);
192 	}
193 
194 	conn->c_proposed_version = conn->c_version;
195 	rds_connect_complete(conn);
196 }
197 
198 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
199 				      struct rdma_conn_param *conn_param,
200 				      union rds_ib_conn_priv *dp,
201 				      u32 protocol_version,
202 				      u32 max_responder_resources,
203 				      u32 max_initiator_depth,
204 				      bool isv6)
205 {
206 	struct rds_ib_connection *ic = conn->c_transport_data;
207 	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
208 
209 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
210 
211 	conn_param->responder_resources =
212 		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
213 	conn_param->initiator_depth =
214 		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
215 	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
216 	conn_param->rnr_retry_count = 7;
217 
218 	if (dp) {
219 		memset(dp, 0, sizeof(*dp));
220 		if (isv6) {
221 			dp->ricp_v6.dp_saddr = conn->c_laddr;
222 			dp->ricp_v6.dp_daddr = conn->c_faddr;
223 			dp->ricp_v6.dp_protocol_major =
224 			    RDS_PROTOCOL_MAJOR(protocol_version);
225 			dp->ricp_v6.dp_protocol_minor =
226 			    RDS_PROTOCOL_MINOR(protocol_version);
227 			dp->ricp_v6.dp_protocol_minor_mask =
228 			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
229 			dp->ricp_v6.dp_ack_seq =
230 			    cpu_to_be64(rds_ib_piggyb_ack(ic));
231 			dp->ricp_v6.dp_cmn.ricpc_dp_toss = conn->c_tos;
232 
233 			conn_param->private_data = &dp->ricp_v6;
234 			conn_param->private_data_len = sizeof(dp->ricp_v6);
235 		} else {
236 			dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
237 			dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
238 			dp->ricp_v4.dp_protocol_major =
239 			    RDS_PROTOCOL_MAJOR(protocol_version);
240 			dp->ricp_v4.dp_protocol_minor =
241 			    RDS_PROTOCOL_MINOR(protocol_version);
242 			dp->ricp_v4.dp_protocol_minor_mask =
243 			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
244 			dp->ricp_v4.dp_ack_seq =
245 			    cpu_to_be64(rds_ib_piggyb_ack(ic));
246 			dp->ricp_v4.dp_cmn.ricpc_dp_toss = conn->c_tos;
247 
248 			conn_param->private_data = &dp->ricp_v4;
249 			conn_param->private_data_len = sizeof(dp->ricp_v4);
250 		}
251 
252 		/* Advertise flow control */
253 		if (ic->i_flowctl) {
254 			unsigned int credits;
255 
256 			credits = IB_GET_POST_CREDITS
257 				(atomic_read(&ic->i_credits));
258 			if (isv6)
259 				dp->ricp_v6.dp_credit = cpu_to_be32(credits);
260 			else
261 				dp->ricp_v4.dp_credit = cpu_to_be32(credits);
262 			atomic_sub(IB_SET_POST_CREDITS(credits),
263 				   &ic->i_credits);
264 		}
265 	}
266 }
267 
268 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
269 {
270 	rdsdebug("event %u (%s) data %p\n",
271 		 event->event, ib_event_msg(event->event), data);
272 }
273 
274 /* Plucking the oldest entry from the ring can be done concurrently with
275  * the thread refilling the ring.  Each ring operation is protected by
276  * spinlocks and the transient state of refilling doesn't change the
277  * recording of which entry is oldest.
278  *
279  * This relies on IB only calling one cq comp_handler for each cq so that
280  * there will only be one caller of rds_recv_incoming() per RDS connection.
281  */
282 static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
283 {
284 	struct rds_connection *conn = context;
285 	struct rds_ib_connection *ic = conn->c_transport_data;
286 
287 	rdsdebug("conn %p cq %p\n", conn, cq);
288 
289 	rds_ib_stats_inc(s_ib_evt_handler_call);
290 
291 	tasklet_schedule(&ic->i_recv_tasklet);
292 }
293 
294 static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
295 		     struct ib_wc *wcs)
296 {
297 	int nr, i;
298 	struct ib_wc *wc;
299 
300 	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
301 		for (i = 0; i < nr; i++) {
302 			wc = wcs + i;
303 			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
304 				 (unsigned long long)wc->wr_id, wc->status,
305 				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
306 
307 			if (wc->wr_id <= ic->i_send_ring.w_nr ||
308 			    wc->wr_id == RDS_IB_ACK_WR_ID)
309 				rds_ib_send_cqe_handler(ic, wc);
310 			else
311 				rds_ib_mr_cqe_handler(ic, wc);
312 
313 		}
314 	}
315 }
316 
317 static void rds_ib_tasklet_fn_send(unsigned long data)
318 {
319 	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
320 	struct rds_connection *conn = ic->conn;
321 
322 	rds_ib_stats_inc(s_ib_tasklet_call);
323 
324 	/* if cq has been already reaped, ignore incoming cq event */
325 	if (atomic_read(&ic->i_cq_quiesce))
326 		return;
327 
328 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
329 	ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
330 	poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
331 
332 	if (rds_conn_up(conn) &&
333 	    (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
334 	    test_bit(0, &conn->c_map_queued)))
335 		rds_send_xmit(&ic->conn->c_path[0]);
336 }
337 
338 static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
339 		     struct ib_wc *wcs,
340 		     struct rds_ib_ack_state *ack_state)
341 {
342 	int nr, i;
343 	struct ib_wc *wc;
344 
345 	while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
346 		for (i = 0; i < nr; i++) {
347 			wc = wcs + i;
348 			rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
349 				 (unsigned long long)wc->wr_id, wc->status,
350 				 wc->byte_len, be32_to_cpu(wc->ex.imm_data));
351 
352 			rds_ib_recv_cqe_handler(ic, wc, ack_state);
353 		}
354 	}
355 }
356 
357 static void rds_ib_tasklet_fn_recv(unsigned long data)
358 {
359 	struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
360 	struct rds_connection *conn = ic->conn;
361 	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
362 	struct rds_ib_ack_state state;
363 
364 	if (!rds_ibdev)
365 		rds_conn_drop(conn);
366 
367 	rds_ib_stats_inc(s_ib_tasklet_call);
368 
369 	/* if cq has been already reaped, ignore incoming cq event */
370 	if (atomic_read(&ic->i_cq_quiesce))
371 		return;
372 
373 	memset(&state, 0, sizeof(state));
374 	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
375 	ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
376 	poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
377 
378 	if (state.ack_next_valid)
379 		rds_ib_set_ack(ic, state.ack_next, state.ack_required);
380 	if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
381 		rds_send_drop_acked(conn, state.ack_recv, NULL);
382 		ic->i_ack_recv = state.ack_recv;
383 	}
384 
385 	if (rds_conn_up(conn))
386 		rds_ib_attempt_ack(ic);
387 }
388 
389 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
390 {
391 	struct rds_connection *conn = data;
392 	struct rds_ib_connection *ic = conn->c_transport_data;
393 
394 	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
395 		 ib_event_msg(event->event));
396 
397 	switch (event->event) {
398 	case IB_EVENT_COMM_EST:
399 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
400 		break;
401 	default:
402 		rdsdebug("Fatal QP Event %u (%s) - connection %pI6c->%pI6c, reconnecting\n",
403 			 event->event, ib_event_msg(event->event),
404 			 &conn->c_laddr, &conn->c_faddr);
405 		rds_conn_drop(conn);
406 		break;
407 	}
408 }
409 
410 static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context)
411 {
412 	struct rds_connection *conn = context;
413 	struct rds_ib_connection *ic = conn->c_transport_data;
414 
415 	rdsdebug("conn %p cq %p\n", conn, cq);
416 
417 	rds_ib_stats_inc(s_ib_evt_handler_call);
418 
419 	tasklet_schedule(&ic->i_send_tasklet);
420 }
421 
422 static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev)
423 {
424 	int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1];
425 	int index = rds_ibdev->dev->num_comp_vectors - 1;
426 	int i;
427 
428 	for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) {
429 		if (rds_ibdev->vector_load[i] < min) {
430 			index = i;
431 			min = rds_ibdev->vector_load[i];
432 		}
433 	}
434 
435 	rds_ibdev->vector_load[index]++;
436 	return index;
437 }
438 
439 static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index)
440 {
441 	rds_ibdev->vector_load[index]--;
442 }
443 
444 /* Allocate DMA coherent memory to be used to store struct rds_header for
445  * sending/receiving packets.  The pointers to the DMA memory and the
446  * associated DMA addresses are stored in two arrays.
447  *
448  * @ibdev: the IB device
449  * @pool: the DMA memory pool
450  * @dma_addrs: pointer to the array for storing DMA addresses
451  * @num_hdrs: number of headers to allocate
452  *
453  * It returns the pointer to the array storing the DMA memory pointers.  On
454  * error, NULL pointer is returned.
455  */
456 struct rds_header **rds_dma_hdrs_alloc(struct ib_device *ibdev,
457 				       struct dma_pool *pool,
458 				       dma_addr_t **dma_addrs, u32 num_hdrs)
459 {
460 	struct rds_header **hdrs;
461 	dma_addr_t *hdr_daddrs;
462 	u32 i;
463 
464 	hdrs = kvmalloc_node(sizeof(*hdrs) * num_hdrs, GFP_KERNEL,
465 			     ibdev_to_node(ibdev));
466 	if (!hdrs)
467 		return NULL;
468 
469 	hdr_daddrs = kvmalloc_node(sizeof(*hdr_daddrs) * num_hdrs, GFP_KERNEL,
470 				   ibdev_to_node(ibdev));
471 	if (!hdr_daddrs) {
472 		kvfree(hdrs);
473 		return NULL;
474 	}
475 
476 	for (i = 0; i < num_hdrs; i++) {
477 		hdrs[i] = dma_pool_zalloc(pool, GFP_KERNEL, &hdr_daddrs[i]);
478 		if (!hdrs[i]) {
479 			rds_dma_hdrs_free(pool, hdrs, hdr_daddrs, i);
480 			return NULL;
481 		}
482 	}
483 
484 	*dma_addrs = hdr_daddrs;
485 	return hdrs;
486 }
487 
488 /* Free the DMA memory used to store struct rds_header.
489  *
490  * @pool: the DMA memory pool
491  * @hdrs: pointer to the array storing DMA memory pointers
492  * @dma_addrs: pointer to the array storing DMA addresses
493  * @num_hdars: number of headers to free.
494  */
495 void rds_dma_hdrs_free(struct dma_pool *pool, struct rds_header **hdrs,
496 		       dma_addr_t *dma_addrs, u32 num_hdrs)
497 {
498 	u32 i;
499 
500 	for (i = 0; i < num_hdrs; i++)
501 		dma_pool_free(pool, hdrs[i], dma_addrs[i]);
502 	kvfree(hdrs);
503 	kvfree(dma_addrs);
504 }
505 
506 /*
507  * This needs to be very careful to not leave IS_ERR pointers around for
508  * cleanup to trip over.
509  */
510 static int rds_ib_setup_qp(struct rds_connection *conn)
511 {
512 	struct rds_ib_connection *ic = conn->c_transport_data;
513 	struct ib_device *dev = ic->i_cm_id->device;
514 	struct ib_qp_init_attr attr;
515 	struct ib_cq_init_attr cq_attr = {};
516 	struct rds_ib_device *rds_ibdev;
517 	unsigned long max_wrs;
518 	int ret, fr_queue_space;
519 	struct dma_pool *pool;
520 
521 	/*
522 	 * It's normal to see a null device if an incoming connection races
523 	 * with device removal, so we don't print a warning.
524 	 */
525 	rds_ibdev = rds_ib_get_client_data(dev);
526 	if (!rds_ibdev)
527 		return -EOPNOTSUPP;
528 
529 	/* The fr_queue_space is currently set to 512, to add extra space on
530 	 * completion queue and send queue. This extra space is used for FRWR
531 	 * registration and invalidation work requests
532 	 */
533 	fr_queue_space = RDS_IB_DEFAULT_FR_WR;
534 
535 	/* add the conn now so that connection establishment has the dev */
536 	rds_ib_add_conn(rds_ibdev, conn);
537 
538 	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_send_wr + 1 ?
539 		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_send_wr;
540 	if (ic->i_send_ring.w_nr != max_wrs)
541 		rds_ib_ring_resize(&ic->i_send_ring, max_wrs);
542 
543 	max_wrs = rds_ibdev->max_wrs < rds_ib_sysctl_max_recv_wr + 1 ?
544 		rds_ibdev->max_wrs - 1 : rds_ib_sysctl_max_recv_wr;
545 	if (ic->i_recv_ring.w_nr != max_wrs)
546 		rds_ib_ring_resize(&ic->i_recv_ring, max_wrs);
547 
548 	/* Protection domain and memory range */
549 	ic->i_pd = rds_ibdev->pd;
550 
551 	ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev);
552 	cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
553 	cq_attr.comp_vector = ic->i_scq_vector;
554 	ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
555 				     rds_ib_cq_event_handler, conn,
556 				     &cq_attr);
557 	if (IS_ERR(ic->i_send_cq)) {
558 		ret = PTR_ERR(ic->i_send_cq);
559 		ic->i_send_cq = NULL;
560 		ibdev_put_vector(rds_ibdev, ic->i_scq_vector);
561 		rdsdebug("ib_create_cq send failed: %d\n", ret);
562 		goto rds_ibdev_out;
563 	}
564 
565 	ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev);
566 	cq_attr.cqe = ic->i_recv_ring.w_nr;
567 	cq_attr.comp_vector = ic->i_rcq_vector;
568 	ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv,
569 				     rds_ib_cq_event_handler, conn,
570 				     &cq_attr);
571 	if (IS_ERR(ic->i_recv_cq)) {
572 		ret = PTR_ERR(ic->i_recv_cq);
573 		ic->i_recv_cq = NULL;
574 		ibdev_put_vector(rds_ibdev, ic->i_rcq_vector);
575 		rdsdebug("ib_create_cq recv failed: %d\n", ret);
576 		goto send_cq_out;
577 	}
578 
579 	ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
580 	if (ret) {
581 		rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
582 		goto recv_cq_out;
583 	}
584 
585 	ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
586 	if (ret) {
587 		rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
588 		goto recv_cq_out;
589 	}
590 
591 	/* XXX negotiate max send/recv with remote? */
592 	memset(&attr, 0, sizeof(attr));
593 	attr.event_handler = rds_ib_qp_event_handler;
594 	attr.qp_context = conn;
595 	/* + 1 to allow for the single ack message */
596 	attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
597 	attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
598 	attr.cap.max_send_sge = rds_ibdev->max_sge;
599 	attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
600 	attr.sq_sig_type = IB_SIGNAL_REQ_WR;
601 	attr.qp_type = IB_QPT_RC;
602 	attr.send_cq = ic->i_send_cq;
603 	attr.recv_cq = ic->i_recv_cq;
604 
605 	/*
606 	 * XXX this can fail if max_*_wr is too large?  Are we supposed
607 	 * to back off until we get a value that the hardware can support?
608 	 */
609 	ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
610 	if (ret) {
611 		rdsdebug("rdma_create_qp failed: %d\n", ret);
612 		goto recv_cq_out;
613 	}
614 
615 	pool = rds_ibdev->rid_hdrs_pool;
616 	ic->i_send_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_send_hdrs_dma,
617 					     ic->i_send_ring.w_nr);
618 	if (!ic->i_send_hdrs) {
619 		ret = -ENOMEM;
620 		rdsdebug("DMA send hdrs alloc failed\n");
621 		goto qp_out;
622 	}
623 
624 	ic->i_recv_hdrs = rds_dma_hdrs_alloc(dev, pool, &ic->i_recv_hdrs_dma,
625 					     ic->i_recv_ring.w_nr);
626 	if (!ic->i_recv_hdrs) {
627 		ret = -ENOMEM;
628 		rdsdebug("DMA recv hdrs alloc failed\n");
629 		goto send_hdrs_dma_out;
630 	}
631 
632 	ic->i_ack = dma_pool_zalloc(pool, GFP_KERNEL,
633 				    &ic->i_ack_dma);
634 	if (!ic->i_ack) {
635 		ret = -ENOMEM;
636 		rdsdebug("DMA ack header alloc failed\n");
637 		goto recv_hdrs_dma_out;
638 	}
639 
640 	ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work),
641 					      ic->i_send_ring.w_nr),
642 				   ibdev_to_node(dev));
643 	if (!ic->i_sends) {
644 		ret = -ENOMEM;
645 		rdsdebug("send allocation failed\n");
646 		goto ack_dma_out;
647 	}
648 
649 	ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work),
650 					      ic->i_recv_ring.w_nr),
651 				   ibdev_to_node(dev));
652 	if (!ic->i_recvs) {
653 		ret = -ENOMEM;
654 		rdsdebug("recv allocation failed\n");
655 		goto sends_out;
656 	}
657 
658 	rds_ib_recv_init_ack(ic);
659 
660 	rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd,
661 		 ic->i_send_cq, ic->i_recv_cq);
662 
663 	goto out;
664 
665 sends_out:
666 	vfree(ic->i_sends);
667 
668 ack_dma_out:
669 	dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
670 	ic->i_ack = NULL;
671 
672 recv_hdrs_dma_out:
673 	rds_dma_hdrs_free(pool, ic->i_recv_hdrs, ic->i_recv_hdrs_dma,
674 			  ic->i_recv_ring.w_nr);
675 	ic->i_recv_hdrs = NULL;
676 	ic->i_recv_hdrs_dma = NULL;
677 
678 send_hdrs_dma_out:
679 	rds_dma_hdrs_free(pool, ic->i_send_hdrs, ic->i_send_hdrs_dma,
680 			  ic->i_send_ring.w_nr);
681 	ic->i_send_hdrs = NULL;
682 	ic->i_send_hdrs_dma = NULL;
683 
684 qp_out:
685 	rdma_destroy_qp(ic->i_cm_id);
686 recv_cq_out:
687 	ib_destroy_cq(ic->i_recv_cq);
688 	ic->i_recv_cq = NULL;
689 send_cq_out:
690 	ib_destroy_cq(ic->i_send_cq);
691 	ic->i_send_cq = NULL;
692 rds_ibdev_out:
693 	rds_ib_remove_conn(rds_ibdev, conn);
694 out:
695 	rds_ib_dev_put(rds_ibdev);
696 
697 	return ret;
698 }
699 
700 static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
701 {
702 	const union rds_ib_conn_priv *dp = event->param.conn.private_data;
703 	u8 data_len, major, minor;
704 	u32 version = 0;
705 	__be16 mask;
706 	u16 common;
707 
708 	/*
709 	 * rdma_cm private data is odd - when there is any private data in the
710 	 * request, we will be given a pretty large buffer without telling us the
711 	 * original size. The only way to tell the difference is by looking at
712 	 * the contents, which are initialized to zero.
713 	 * If the protocol version fields aren't set, this is a connection attempt
714 	 * from an older version. This could could be 3.0 or 2.0 - we can't tell.
715 	 * We really should have changed this for OFED 1.3 :-(
716 	 */
717 
718 	/* Be paranoid. RDS always has privdata */
719 	if (!event->param.conn.private_data_len) {
720 		printk(KERN_NOTICE "RDS incoming connection has no private data, "
721 			"rejecting\n");
722 		return 0;
723 	}
724 
725 	if (isv6) {
726 		data_len = sizeof(struct rds6_ib_connect_private);
727 		major = dp->ricp_v6.dp_protocol_major;
728 		minor = dp->ricp_v6.dp_protocol_minor;
729 		mask = dp->ricp_v6.dp_protocol_minor_mask;
730 	} else {
731 		data_len = sizeof(struct rds_ib_connect_private);
732 		major = dp->ricp_v4.dp_protocol_major;
733 		minor = dp->ricp_v4.dp_protocol_minor;
734 		mask = dp->ricp_v4.dp_protocol_minor_mask;
735 	}
736 
737 	/* Even if len is crap *now* I still want to check it. -ASG */
738 	if (event->param.conn.private_data_len < data_len || major == 0)
739 		return RDS_PROTOCOL_4_0;
740 
741 	common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
742 	if (major == 4 && common) {
743 		version = RDS_PROTOCOL_4_0;
744 		while ((common >>= 1) != 0)
745 			version++;
746 	} else if (RDS_PROTOCOL_COMPAT_VERSION ==
747 		   RDS_PROTOCOL(major, minor)) {
748 		version = RDS_PROTOCOL_COMPAT_VERSION;
749 	} else {
750 		if (isv6)
751 			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
752 					   &dp->ricp_v6.dp_saddr, major, minor);
753 		else
754 			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
755 					   &dp->ricp_v4.dp_saddr, major, minor);
756 	}
757 	return version;
758 }
759 
760 #if IS_ENABLED(CONFIG_IPV6)
761 /* Given an IPv6 address, find the net_device which hosts that address and
762  * return its index.  This is used by the rds_ib_cm_handle_connect() code to
763  * find the interface index of where an incoming request comes from when
764  * the request is using a link local address.
765  *
766  * Note one problem in this search.  It is possible that two interfaces have
767  * the same link local address.  Unfortunately, this cannot be solved unless
768  * the underlying layer gives us the interface which an incoming RDMA connect
769  * request comes from.
770  */
771 static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
772 {
773 	struct net_device *dev;
774 	int idx = 0;
775 
776 	rcu_read_lock();
777 	for_each_netdev_rcu(net, dev) {
778 		if (ipv6_chk_addr(net, addr, dev, 1)) {
779 			idx = dev->ifindex;
780 			break;
781 		}
782 	}
783 	rcu_read_unlock();
784 
785 	return idx;
786 }
787 #endif
788 
789 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
790 			     struct rdma_cm_event *event, bool isv6)
791 {
792 	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
793 	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
794 	const struct rds_ib_conn_priv_cmn *dp_cmn;
795 	struct rds_connection *conn = NULL;
796 	struct rds_ib_connection *ic = NULL;
797 	struct rdma_conn_param conn_param;
798 	const union rds_ib_conn_priv *dp;
799 	union rds_ib_conn_priv dp_rep;
800 	struct in6_addr s_mapped_addr;
801 	struct in6_addr d_mapped_addr;
802 	const struct in6_addr *saddr6;
803 	const struct in6_addr *daddr6;
804 	int destroy = 1;
805 	u32 ifindex = 0;
806 	u32 version;
807 	int err = 1;
808 
809 	/* Check whether the remote protocol version matches ours. */
810 	version = rds_ib_protocol_compatible(event, isv6);
811 	if (!version) {
812 		err = RDS_RDMA_REJ_INCOMPAT;
813 		goto out;
814 	}
815 
816 	dp = event->param.conn.private_data;
817 	if (isv6) {
818 #if IS_ENABLED(CONFIG_IPV6)
819 		dp_cmn = &dp->ricp_v6.dp_cmn;
820 		saddr6 = &dp->ricp_v6.dp_saddr;
821 		daddr6 = &dp->ricp_v6.dp_daddr;
822 		/* If either address is link local, need to find the
823 		 * interface index in order to create a proper RDS
824 		 * connection.
825 		 */
826 		if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
827 			/* Using init_net for now ..  */
828 			ifindex = __rds_find_ifindex(&init_net, daddr6);
829 			/* No index found...  Need to bail out. */
830 			if (ifindex == 0) {
831 				err = -EOPNOTSUPP;
832 				goto out;
833 			}
834 		} else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
835 			/* Use our address to find the correct index. */
836 			ifindex = __rds_find_ifindex(&init_net, daddr6);
837 			/* No index found...  Need to bail out. */
838 			if (ifindex == 0) {
839 				err = -EOPNOTSUPP;
840 				goto out;
841 			}
842 		}
843 #else
844 		err = -EOPNOTSUPP;
845 		goto out;
846 #endif
847 	} else {
848 		dp_cmn = &dp->ricp_v4.dp_cmn;
849 		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
850 		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
851 		saddr6 = &s_mapped_addr;
852 		daddr6 = &d_mapped_addr;
853 	}
854 
855 	rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx, tos:%d\n",
856 		 saddr6, daddr6, RDS_PROTOCOL_MAJOR(version),
857 		 RDS_PROTOCOL_MINOR(version),
858 		 (unsigned long long)be64_to_cpu(lguid),
859 		 (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_dp_toss);
860 
861 	/* RDS/IB is not currently netns aware, thus init_net */
862 	conn = rds_conn_create(&init_net, daddr6, saddr6,
863 			       &rds_ib_transport, dp_cmn->ricpc_dp_toss,
864 			       GFP_KERNEL, ifindex);
865 	if (IS_ERR(conn)) {
866 		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
867 		conn = NULL;
868 		goto out;
869 	}
870 
871 	/*
872 	 * The connection request may occur while the
873 	 * previous connection exist, e.g. in case of failover.
874 	 * But as connections may be initiated simultaneously
875 	 * by both hosts, we have a random backoff mechanism -
876 	 * see the comment above rds_queue_reconnect()
877 	 */
878 	mutex_lock(&conn->c_cm_lock);
879 	if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
880 		if (rds_conn_state(conn) == RDS_CONN_UP) {
881 			rdsdebug("incoming connect while connecting\n");
882 			rds_conn_drop(conn);
883 			rds_ib_stats_inc(s_ib_listen_closed_stale);
884 		} else
885 		if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
886 			/* Wait and see - our connect may still be succeeding */
887 			rds_ib_stats_inc(s_ib_connect_raced);
888 		}
889 		goto out;
890 	}
891 
892 	ic = conn->c_transport_data;
893 
894 	rds_ib_set_protocol(conn, version);
895 	rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
896 
897 	/* If the peer gave us the last packet it saw, process this as if
898 	 * we had received a regular ACK. */
899 	if (dp_cmn->ricpc_ack_seq)
900 		rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
901 				    NULL);
902 
903 	BUG_ON(cm_id->context);
904 	BUG_ON(ic->i_cm_id);
905 
906 	ic->i_cm_id = cm_id;
907 	cm_id->context = conn;
908 
909 	/* We got halfway through setting up the ib_connection, if we
910 	 * fail now, we have to take the long route out of this mess. */
911 	destroy = 0;
912 
913 	err = rds_ib_setup_qp(conn);
914 	if (err) {
915 		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
916 		goto out;
917 	}
918 
919 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
920 				  event->param.conn.responder_resources,
921 				  event->param.conn.initiator_depth, isv6);
922 
923 	/* rdma_accept() calls rdma_reject() internally if it fails */
924 	if (rdma_accept(cm_id, &conn_param))
925 		rds_ib_conn_error(conn, "rdma_accept failed\n");
926 
927 out:
928 	if (conn)
929 		mutex_unlock(&conn->c_cm_lock);
930 	if (err)
931 		rdma_reject(cm_id, &err, sizeof(int),
932 			    IB_CM_REJ_CONSUMER_DEFINED);
933 	return destroy;
934 }
935 
936 
937 int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
938 {
939 	struct rds_connection *conn = cm_id->context;
940 	struct rds_ib_connection *ic = conn->c_transport_data;
941 	struct rdma_conn_param conn_param;
942 	union rds_ib_conn_priv dp;
943 	int ret;
944 
945 	/* If the peer doesn't do protocol negotiation, we must
946 	 * default to RDSv3.0 */
947 	rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
948 	ic->i_flowctl = rds_ib_sysctl_flow_control;	/* advertise flow control */
949 
950 	ret = rds_ib_setup_qp(conn);
951 	if (ret) {
952 		rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
953 		goto out;
954 	}
955 
956 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
957 				  conn->c_proposed_version,
958 				  UINT_MAX, UINT_MAX, isv6);
959 	ret = rdma_connect(cm_id, &conn_param);
960 	if (ret)
961 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
962 
963 out:
964 	/* Beware - returning non-zero tells the rdma_cm to destroy
965 	 * the cm_id. We should certainly not do it as long as we still
966 	 * "own" the cm_id. */
967 	if (ret) {
968 		if (ic->i_cm_id == cm_id)
969 			ret = 0;
970 	}
971 	ic->i_active_side = true;
972 	return ret;
973 }
974 
975 int rds_ib_conn_path_connect(struct rds_conn_path *cp)
976 {
977 	struct rds_connection *conn = cp->cp_conn;
978 	struct sockaddr_storage src, dest;
979 	rdma_cm_event_handler handler;
980 	struct rds_ib_connection *ic;
981 	int ret;
982 
983 	ic = conn->c_transport_data;
984 
985 	/* XXX I wonder what affect the port space has */
986 	/* delegate cm event handler to rdma_transport */
987 #if IS_ENABLED(CONFIG_IPV6)
988 	if (conn->c_isv6)
989 		handler = rds6_rdma_cm_event_handler;
990 	else
991 #endif
992 		handler = rds_rdma_cm_event_handler;
993 	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
994 				     RDMA_PS_TCP, IB_QPT_RC);
995 	if (IS_ERR(ic->i_cm_id)) {
996 		ret = PTR_ERR(ic->i_cm_id);
997 		ic->i_cm_id = NULL;
998 		rdsdebug("rdma_create_id() failed: %d\n", ret);
999 		goto out;
1000 	}
1001 
1002 	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
1003 
1004 	if (ipv6_addr_v4mapped(&conn->c_faddr)) {
1005 		struct sockaddr_in *sin;
1006 
1007 		sin = (struct sockaddr_in *)&src;
1008 		sin->sin_family = AF_INET;
1009 		sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
1010 		sin->sin_port = 0;
1011 
1012 		sin = (struct sockaddr_in *)&dest;
1013 		sin->sin_family = AF_INET;
1014 		sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
1015 		sin->sin_port = htons(RDS_PORT);
1016 	} else {
1017 		struct sockaddr_in6 *sin6;
1018 
1019 		sin6 = (struct sockaddr_in6 *)&src;
1020 		sin6->sin6_family = AF_INET6;
1021 		sin6->sin6_addr = conn->c_laddr;
1022 		sin6->sin6_port = 0;
1023 		sin6->sin6_scope_id = conn->c_dev_if;
1024 
1025 		sin6 = (struct sockaddr_in6 *)&dest;
1026 		sin6->sin6_family = AF_INET6;
1027 		sin6->sin6_addr = conn->c_faddr;
1028 		sin6->sin6_port = htons(RDS_CM_PORT);
1029 		sin6->sin6_scope_id = conn->c_dev_if;
1030 	}
1031 
1032 	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
1033 				(struct sockaddr *)&dest,
1034 				RDS_RDMA_RESOLVE_TIMEOUT_MS);
1035 	if (ret) {
1036 		rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
1037 			 ret);
1038 		rdma_destroy_id(ic->i_cm_id);
1039 		ic->i_cm_id = NULL;
1040 	}
1041 
1042 out:
1043 	return ret;
1044 }
1045 
1046 /*
1047  * This is so careful about only cleaning up resources that were built up
1048  * so that it can be called at any point during startup.  In fact it
1049  * can be called multiple times for a given connection.
1050  */
1051 void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
1052 {
1053 	struct rds_connection *conn = cp->cp_conn;
1054 	struct rds_ib_connection *ic = conn->c_transport_data;
1055 	int err = 0;
1056 
1057 	rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
1058 		 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
1059 		 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
1060 
1061 	if (ic->i_cm_id) {
1062 		rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
1063 		err = rdma_disconnect(ic->i_cm_id);
1064 		if (err) {
1065 			/* Actually this may happen quite frequently, when
1066 			 * an outgoing connect raced with an incoming connect.
1067 			 */
1068 			rdsdebug("failed to disconnect, cm: %p err %d\n",
1069 				ic->i_cm_id, err);
1070 		}
1071 
1072 		/* kick off "flush_worker" for all pools in order to reap
1073 		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
1074 		 */
1075 		rds_ib_flush_mrs();
1076 
1077 		/*
1078 		 * We want to wait for tx and rx completion to finish
1079 		 * before we tear down the connection, but we have to be
1080 		 * careful not to get stuck waiting on a send ring that
1081 		 * only has unsignaled sends in it.  We've shutdown new
1082 		 * sends before getting here so by waiting for signaled
1083 		 * sends to complete we're ensured that there will be no
1084 		 * more tx processing.
1085 		 */
1086 		wait_event(rds_ib_ring_empty_wait,
1087 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
1088 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
1089 			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
1090 			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
1091 		tasklet_kill(&ic->i_send_tasklet);
1092 		tasklet_kill(&ic->i_recv_tasklet);
1093 
1094 		atomic_set(&ic->i_cq_quiesce, 1);
1095 
1096 		/* first destroy the ib state that generates callbacks */
1097 		if (ic->i_cm_id->qp)
1098 			rdma_destroy_qp(ic->i_cm_id);
1099 		if (ic->i_send_cq) {
1100 			if (ic->rds_ibdev)
1101 				ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector);
1102 			ib_destroy_cq(ic->i_send_cq);
1103 		}
1104 
1105 		if (ic->i_recv_cq) {
1106 			if (ic->rds_ibdev)
1107 				ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector);
1108 			ib_destroy_cq(ic->i_recv_cq);
1109 		}
1110 
1111 		if (ic->rds_ibdev) {
1112 			struct dma_pool *pool;
1113 
1114 			pool = ic->rds_ibdev->rid_hdrs_pool;
1115 
1116 			/* then free the resources that ib callbacks use */
1117 			if (ic->i_send_hdrs) {
1118 				rds_dma_hdrs_free(pool, ic->i_send_hdrs,
1119 						  ic->i_send_hdrs_dma,
1120 						  ic->i_send_ring.w_nr);
1121 				ic->i_send_hdrs = NULL;
1122 				ic->i_send_hdrs_dma = NULL;
1123 			}
1124 
1125 			if (ic->i_recv_hdrs) {
1126 				rds_dma_hdrs_free(pool, ic->i_recv_hdrs,
1127 						  ic->i_recv_hdrs_dma,
1128 						  ic->i_recv_ring.w_nr);
1129 				ic->i_recv_hdrs = NULL;
1130 				ic->i_recv_hdrs_dma = NULL;
1131 			}
1132 
1133 			if (ic->i_ack) {
1134 				dma_pool_free(pool, ic->i_ack, ic->i_ack_dma);
1135 				ic->i_ack = NULL;
1136 			}
1137 		} else {
1138 			WARN_ON(ic->i_send_hdrs);
1139 			WARN_ON(ic->i_send_hdrs_dma);
1140 			WARN_ON(ic->i_recv_hdrs);
1141 			WARN_ON(ic->i_recv_hdrs_dma);
1142 			WARN_ON(ic->i_ack);
1143 		}
1144 
1145 		if (ic->i_sends)
1146 			rds_ib_send_clear_ring(ic);
1147 		if (ic->i_recvs)
1148 			rds_ib_recv_clear_ring(ic);
1149 
1150 		rdma_destroy_id(ic->i_cm_id);
1151 
1152 		/*
1153 		 * Move connection back to the nodev list.
1154 		 */
1155 		if (ic->rds_ibdev)
1156 			rds_ib_remove_conn(ic->rds_ibdev, conn);
1157 
1158 		ic->i_cm_id = NULL;
1159 		ic->i_pd = NULL;
1160 		ic->i_send_cq = NULL;
1161 		ic->i_recv_cq = NULL;
1162 	}
1163 	BUG_ON(ic->rds_ibdev);
1164 
1165 	/* Clear pending transmit */
1166 	if (ic->i_data_op) {
1167 		struct rds_message *rm;
1168 
1169 		rm = container_of(ic->i_data_op, struct rds_message, data);
1170 		rds_message_put(rm);
1171 		ic->i_data_op = NULL;
1172 	}
1173 
1174 	/* Clear the ACK state */
1175 	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
1176 #ifdef KERNEL_HAS_ATOMIC64
1177 	atomic64_set(&ic->i_ack_next, 0);
1178 #else
1179 	ic->i_ack_next = 0;
1180 #endif
1181 	ic->i_ack_recv = 0;
1182 
1183 	/* Clear flow control state */
1184 	ic->i_flowctl = 0;
1185 	atomic_set(&ic->i_credits, 0);
1186 
1187 	/* Re-init rings, but retain sizes. */
1188 	rds_ib_ring_init(&ic->i_send_ring, ic->i_send_ring.w_nr);
1189 	rds_ib_ring_init(&ic->i_recv_ring, ic->i_recv_ring.w_nr);
1190 
1191 	if (ic->i_ibinc) {
1192 		rds_inc_put(&ic->i_ibinc->ii_inc);
1193 		ic->i_ibinc = NULL;
1194 	}
1195 
1196 	vfree(ic->i_sends);
1197 	ic->i_sends = NULL;
1198 	vfree(ic->i_recvs);
1199 	ic->i_recvs = NULL;
1200 	ic->i_active_side = false;
1201 }
1202 
1203 int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
1204 {
1205 	struct rds_ib_connection *ic;
1206 	unsigned long flags;
1207 	int ret;
1208 
1209 	/* XXX too lazy? */
1210 	ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
1211 	if (!ic)
1212 		return -ENOMEM;
1213 
1214 	ret = rds_ib_recv_alloc_caches(ic, gfp);
1215 	if (ret) {
1216 		kfree(ic);
1217 		return ret;
1218 	}
1219 
1220 	INIT_LIST_HEAD(&ic->ib_node);
1221 	tasklet_init(&ic->i_send_tasklet, rds_ib_tasklet_fn_send,
1222 		     (unsigned long)ic);
1223 	tasklet_init(&ic->i_recv_tasklet, rds_ib_tasklet_fn_recv,
1224 		     (unsigned long)ic);
1225 	mutex_init(&ic->i_recv_mutex);
1226 #ifndef KERNEL_HAS_ATOMIC64
1227 	spin_lock_init(&ic->i_ack_lock);
1228 #endif
1229 	atomic_set(&ic->i_signaled_sends, 0);
1230 	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
1231 
1232 	/*
1233 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
1234 	 * must be initialized before it can be called.
1235 	 */
1236 	rds_ib_ring_init(&ic->i_send_ring, 0);
1237 	rds_ib_ring_init(&ic->i_recv_ring, 0);
1238 
1239 	ic->conn = conn;
1240 	conn->c_transport_data = ic;
1241 
1242 	spin_lock_irqsave(&ib_nodev_conns_lock, flags);
1243 	list_add_tail(&ic->ib_node, &ib_nodev_conns);
1244 	spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
1245 
1246 
1247 	rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
1248 	return 0;
1249 }
1250 
1251 /*
1252  * Free a connection. Connection must be shut down and not set for reconnect.
1253  */
1254 void rds_ib_conn_free(void *arg)
1255 {
1256 	struct rds_ib_connection *ic = arg;
1257 	spinlock_t	*lock_ptr;
1258 
1259 	rdsdebug("ic %p\n", ic);
1260 
1261 	/*
1262 	 * Conn is either on a dev's list or on the nodev list.
1263 	 * A race with shutdown() or connect() would cause problems
1264 	 * (since rds_ibdev would change) but that should never happen.
1265 	 */
1266 	lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
1267 
1268 	spin_lock_irq(lock_ptr);
1269 	list_del(&ic->ib_node);
1270 	spin_unlock_irq(lock_ptr);
1271 
1272 	rds_ib_recv_free_caches(ic);
1273 
1274 	kfree(ic);
1275 }
1276 
1277 
/*
 * An error occurred on the connection: drop it, then print the
 * caller-supplied message.
 *
 * @conn: connection to drop.
 * @fmt:  printk-style format string, followed by its arguments.
 */
void __rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
	va_list args;

	/* the drop is issued before anything is printed */
	rds_conn_drop(conn);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);
}
1292