1 /*
2  * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
3  * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the BSD-type
9  * license below:
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  *      Redistributions of source code must retain the above copyright
16  *      notice, this list of conditions and the following disclaimer.
17  *
18  *      Redistributions in binary form must reproduce the above
19  *      copyright notice, this list of conditions and the following
20  *      disclaimer in the documentation and/or other materials provided
21  *      with the distribution.
22  *
23  *      Neither the name of the Network Appliance, Inc. nor the names of
24  *      its contributors may be used to endorse or promote products
25  *      derived from this software without specific prior written
26  *      permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
31  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
32  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
33  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
34  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
38  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39  *
40  * Author: Tom Tucker <tom@opengridcomputing.com>
41  */
42 
43 #include <linux/sunrpc/svc_xprt.h>
44 #include <linux/sunrpc/addr.h>
45 #include <linux/sunrpc/debug.h>
46 #include <linux/sunrpc/rpc_rdma.h>
47 #include <linux/interrupt.h>
48 #include <linux/sched.h>
49 #include <linux/slab.h>
50 #include <linux/spinlock.h>
51 #include <linux/workqueue.h>
52 #include <rdma/ib_verbs.h>
53 #include <rdma/rdma_cm.h>
54 #include <linux/sunrpc/svc_rdma.h>
55 #include <linux/export.h>
56 #include "xprt_rdma.h"
57 
58 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
59 
/*
 * Forward declarations: rdma_create_xprt() is used by the backchannel
 * create method before its definition, and the remaining functions are
 * referenced by the svc_rdma_ops method table below.
 */
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
					struct net *net,
					struct sockaddr *sa, int salen,
					int flags);
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
static void svc_rdma_release_rqst(struct svc_rqst *);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
72 
/* Transport method table used by svc_rdma_class (the "rdma" transport). */
static struct svc_xprt_ops svc_rdma_ops = {
	.xpo_create = svc_rdma_create,
	.xpo_recvfrom = svc_rdma_recvfrom,
	.xpo_sendto = svc_rdma_sendto,
	.xpo_release_rqst = svc_rdma_release_rqst,
	.xpo_detach = svc_rdma_detach,
	.xpo_free = svc_rdma_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_has_wspace = svc_rdma_has_wspace,
	.xpo_accept = svc_rdma_accept,
	.xpo_secure_port = svc_rdma_secure_port,
	.xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
};
86 
/*
 * Transport class descriptor for the forward-channel "rdma" transport.
 * Not static: it is visible outside this file.
 */
struct svc_xprt_class svc_rdma_class = {
	.xcl_name = "rdma",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_ops,
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
	.xcl_ident = XPRT_TRANSPORT_RDMA,
};
94 
95 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* Forward declarations for the backchannel methods in svc_rdma_bc_ops. */
static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
					   struct sockaddr *, int, int);
static void svc_rdma_bc_detach(struct svc_xprt *);
static void svc_rdma_bc_free(struct svc_xprt *);
100 
/*
 * Method table for the "rdma-bc" backchannel transport class. Only the
 * create, detach, free, prep_reply_hdr and secure_port methods are set.
 */
static struct svc_xprt_ops svc_rdma_bc_ops = {
	.xpo_create = svc_rdma_bc_create,
	.xpo_detach = svc_rdma_bc_detach,
	.xpo_free = svc_rdma_bc_free,
	.xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
	.xpo_secure_port = svc_rdma_secure_port,
};
108 
/*
 * Transport class descriptor for the "rdma-bc" backchannel transport.
 * The maximum payload is 1024 bytes less RPCRDMA_HDRLEN_MIN.
 */
struct svc_xprt_class svc_rdma_bc_class = {
	.xcl_name = "rdma-bc",
	.xcl_owner = THIS_MODULE,
	.xcl_ops = &svc_rdma_bc_ops,
	.xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
};
115 
116 static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
117 					   struct net *net,
118 					   struct sockaddr *sa, int salen,
119 					   int flags)
120 {
121 	struct svcxprt_rdma *cma_xprt;
122 	struct svc_xprt *xprt;
123 
124 	cma_xprt = rdma_create_xprt(serv, 0);
125 	if (!cma_xprt)
126 		return ERR_PTR(-ENOMEM);
127 	xprt = &cma_xprt->sc_xprt;
128 
129 	svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
130 	set_bit(XPT_CONG_CTRL, &xprt->xpt_flags);
131 	serv->sv_bc_xprt = xprt;
132 
133 	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
134 	return xprt;
135 }
136 
/* Backchannel detach method: only emits a debug message. */
static void svc_rdma_bc_detach(struct svc_xprt *xprt)
{
	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
}
141 
142 static void svc_rdma_bc_free(struct svc_xprt *xprt)
143 {
144 	struct svcxprt_rdma *rdma =
145 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
146 
147 	dprintk("svcrdma: %s(%p)\n", __func__, xprt);
148 	if (xprt)
149 		kfree(rdma);
150 }
151 #endif	/* CONFIG_SUNRPC_BACKCHANNEL */
152 
153 static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
154 					   gfp_t flags)
155 {
156 	struct svc_rdma_op_ctxt *ctxt;
157 
158 	ctxt = kmalloc(sizeof(*ctxt), flags);
159 	if (ctxt) {
160 		ctxt->xprt = xprt;
161 		INIT_LIST_HEAD(&ctxt->list);
162 	}
163 	return ctxt;
164 }
165 
166 static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
167 {
168 	unsigned int i;
169 
170 	/* Each RPC/RDMA credit can consume a number of send
171 	 * and receive WQEs. One ctxt is allocated for each.
172 	 */
173 	i = xprt->sc_sq_depth + xprt->sc_rq_depth;
174 
175 	while (i--) {
176 		struct svc_rdma_op_ctxt *ctxt;
177 
178 		ctxt = alloc_ctxt(xprt, GFP_KERNEL);
179 		if (!ctxt) {
180 			dprintk("svcrdma: No memory for RDMA ctxt\n");
181 			return false;
182 		}
183 		list_add(&ctxt->list, &xprt->sc_ctxts);
184 	}
185 	return true;
186 }
187 
188 struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
189 {
190 	struct svc_rdma_op_ctxt *ctxt = NULL;
191 
192 	spin_lock(&xprt->sc_ctxt_lock);
193 	xprt->sc_ctxt_used++;
194 	if (list_empty(&xprt->sc_ctxts))
195 		goto out_empty;
196 
197 	ctxt = list_first_entry(&xprt->sc_ctxts,
198 				struct svc_rdma_op_ctxt, list);
199 	list_del(&ctxt->list);
200 	spin_unlock(&xprt->sc_ctxt_lock);
201 
202 out:
203 	ctxt->count = 0;
204 	ctxt->mapped_sges = 0;
205 	return ctxt;
206 
207 out_empty:
208 	/* Either pre-allocation missed the mark, or send
209 	 * queue accounting is broken.
210 	 */
211 	spin_unlock(&xprt->sc_ctxt_lock);
212 
213 	ctxt = alloc_ctxt(xprt, GFP_NOIO);
214 	if (ctxt)
215 		goto out;
216 
217 	spin_lock(&xprt->sc_ctxt_lock);
218 	xprt->sc_ctxt_used--;
219 	spin_unlock(&xprt->sc_ctxt_lock);
220 	WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
221 	return NULL;
222 }
223 
224 void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
225 {
226 	struct svcxprt_rdma *xprt = ctxt->xprt;
227 	struct ib_device *device = xprt->sc_cm_id->device;
228 	unsigned int i;
229 
230 	for (i = 0; i < ctxt->mapped_sges; i++)
231 		ib_dma_unmap_page(device,
232 				  ctxt->sge[i].addr,
233 				  ctxt->sge[i].length,
234 				  ctxt->direction);
235 	ctxt->mapped_sges = 0;
236 }
237 
238 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
239 {
240 	struct svcxprt_rdma *xprt = ctxt->xprt;
241 	int i;
242 
243 	if (free_pages)
244 		for (i = 0; i < ctxt->count; i++)
245 			put_page(ctxt->pages[i]);
246 
247 	spin_lock(&xprt->sc_ctxt_lock);
248 	xprt->sc_ctxt_used--;
249 	list_add(&ctxt->list, &xprt->sc_ctxts);
250 	spin_unlock(&xprt->sc_ctxt_lock);
251 }
252 
253 static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
254 {
255 	while (!list_empty(&xprt->sc_ctxts)) {
256 		struct svc_rdma_op_ctxt *ctxt;
257 
258 		ctxt = list_first_entry(&xprt->sc_ctxts,
259 					struct svc_rdma_op_ctxt, list);
260 		list_del(&ctxt->list);
261 		kfree(ctxt);
262 	}
263 }
264 
265 /* QP event handler */
266 static void qp_event_handler(struct ib_event *event, void *context)
267 {
268 	struct svc_xprt *xprt = context;
269 
270 	switch (event->event) {
271 	/* These are considered benign events */
272 	case IB_EVENT_PATH_MIG:
273 	case IB_EVENT_COMM_EST:
274 	case IB_EVENT_SQ_DRAINED:
275 	case IB_EVENT_QP_LAST_WQE_REACHED:
276 		dprintk("svcrdma: QP event %s (%d) received for QP=%p\n",
277 			ib_event_msg(event->event), event->event,
278 			event->element.qp);
279 		break;
280 	/* These are considered fatal events */
281 	case IB_EVENT_PATH_MIG_ERR:
282 	case IB_EVENT_QP_FATAL:
283 	case IB_EVENT_QP_REQ_ERR:
284 	case IB_EVENT_QP_ACCESS_ERR:
285 	case IB_EVENT_DEVICE_FATAL:
286 	default:
287 		dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, "
288 			"closing transport\n",
289 			ib_event_msg(event->event), event->event,
290 			event->element.qp);
291 		set_bit(XPT_CLOSE, &xprt->xpt_flags);
292 		break;
293 	}
294 }
295 
296 /**
297  * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
298  * @cq:        completion queue
299  * @wc:        completed WR
300  *
301  */
302 static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
303 {
304 	struct svcxprt_rdma *xprt = cq->cq_context;
305 	struct ib_cqe *cqe = wc->wr_cqe;
306 	struct svc_rdma_op_ctxt *ctxt;
307 
308 	/* WARNING: Only wc->wr_cqe and wc->status are reliable */
309 	ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
310 	svc_rdma_unmap_dma(ctxt);
311 
312 	if (wc->status != IB_WC_SUCCESS)
313 		goto flushed;
314 
315 	/* All wc fields are now known to be valid */
316 	ctxt->byte_len = wc->byte_len;
317 	spin_lock(&xprt->sc_rq_dto_lock);
318 	list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
319 	spin_unlock(&xprt->sc_rq_dto_lock);
320 
321 	set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
322 	if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
323 		goto out;
324 	svc_xprt_enqueue(&xprt->sc_xprt);
325 	goto out;
326 
327 flushed:
328 	if (wc->status != IB_WC_WR_FLUSH_ERR)
329 		pr_warn("svcrdma: receive: %s (%u/0x%x)\n",
330 			ib_wc_status_msg(wc->status),
331 			wc->status, wc->vendor_err);
332 	set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
333 	svc_rdma_put_context(ctxt, 1);
334 
335 out:
336 	svc_xprt_put(&xprt->sc_xprt);
337 }
338 
339 /**
340  * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
341  * @cq:        completion queue
342  * @wc:        completed WR
343  *
344  */
345 void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
346 {
347 	struct svcxprt_rdma *xprt = cq->cq_context;
348 	struct ib_cqe *cqe = wc->wr_cqe;
349 	struct svc_rdma_op_ctxt *ctxt;
350 
351 	atomic_inc(&xprt->sc_sq_avail);
352 	wake_up(&xprt->sc_send_wait);
353 
354 	ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
355 	svc_rdma_unmap_dma(ctxt);
356 	svc_rdma_put_context(ctxt, 1);
357 
358 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
359 		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
360 		if (wc->status != IB_WC_WR_FLUSH_ERR)
361 			pr_err("svcrdma: Send: %s (%u/0x%x)\n",
362 			       ib_wc_status_msg(wc->status),
363 			       wc->status, wc->vendor_err);
364 	}
365 
366 	svc_xprt_put(&xprt->sc_xprt);
367 }
368 
369 static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
370 					     int listener)
371 {
372 	struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
373 
374 	if (!cma_xprt)
375 		return NULL;
376 	svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
377 	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
378 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
379 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
380 	INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
381 	INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
382 	init_waitqueue_head(&cma_xprt->sc_send_wait);
383 
384 	spin_lock_init(&cma_xprt->sc_lock);
385 	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
386 	spin_lock_init(&cma_xprt->sc_ctxt_lock);
387 	spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
388 
389 	/*
390 	 * Note that this implies that the underlying transport support
391 	 * has some form of congestion control (see RFC 7530 section 3.1
392 	 * paragraph 2). For now, we assume that all supported RDMA
393 	 * transports are suitable here.
394 	 */
395 	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
396 
397 	if (listener)
398 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
399 
400 	return cma_xprt;
401 }
402 
403 int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
404 {
405 	struct ib_recv_wr recv_wr, *bad_recv_wr;
406 	struct svc_rdma_op_ctxt *ctxt;
407 	struct page *page;
408 	dma_addr_t pa;
409 	int sge_no;
410 	int buflen;
411 	int ret;
412 
413 	ctxt = svc_rdma_get_context(xprt);
414 	buflen = 0;
415 	ctxt->direction = DMA_FROM_DEVICE;
416 	ctxt->cqe.done = svc_rdma_wc_receive;
417 	for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
418 		if (sge_no >= xprt->sc_max_sge) {
419 			pr_err("svcrdma: Too many sges (%d)\n", sge_no);
420 			goto err_put_ctxt;
421 		}
422 		page = alloc_page(flags);
423 		if (!page)
424 			goto err_put_ctxt;
425 		ctxt->pages[sge_no] = page;
426 		pa = ib_dma_map_page(xprt->sc_cm_id->device,
427 				     page, 0, PAGE_SIZE,
428 				     DMA_FROM_DEVICE);
429 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
430 			goto err_put_ctxt;
431 		svc_rdma_count_mappings(xprt, ctxt);
432 		ctxt->sge[sge_no].addr = pa;
433 		ctxt->sge[sge_no].length = PAGE_SIZE;
434 		ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
435 		ctxt->count = sge_no + 1;
436 		buflen += PAGE_SIZE;
437 	}
438 	recv_wr.next = NULL;
439 	recv_wr.sg_list = &ctxt->sge[0];
440 	recv_wr.num_sge = ctxt->count;
441 	recv_wr.wr_cqe = &ctxt->cqe;
442 
443 	svc_xprt_get(&xprt->sc_xprt);
444 	ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
445 	if (ret) {
446 		svc_rdma_unmap_dma(ctxt);
447 		svc_rdma_put_context(ctxt, 1);
448 		svc_xprt_put(&xprt->sc_xprt);
449 	}
450 	return ret;
451 
452  err_put_ctxt:
453 	svc_rdma_unmap_dma(ctxt);
454 	svc_rdma_put_context(ctxt, 1);
455 	return -ENOMEM;
456 }
457 
458 int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
459 {
460 	int ret = 0;
461 
462 	ret = svc_rdma_post_recv(xprt, flags);
463 	if (ret) {
464 		pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
465 		       ret);
466 		pr_err("svcrdma: closing transport %p.\n", xprt);
467 		set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
468 		ret = -ENOTCONN;
469 	}
470 	return ret;
471 }
472 
473 static void
474 svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
475 			       struct rdma_conn_param *param)
476 {
477 	const struct rpcrdma_connect_private *pmsg = param->private_data;
478 
479 	if (pmsg &&
480 	    pmsg->cp_magic == rpcrdma_cmp_magic &&
481 	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
482 		newxprt->sc_snd_w_inv = pmsg->cp_flags &
483 					RPCRDMA_CMP_F_SND_W_INV_OK;
484 
485 		dprintk("svcrdma: client send_size %u, recv_size %u "
486 			"remote inv %ssupported\n",
487 			rpcrdma_decode_buffer_size(pmsg->cp_send_size),
488 			rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
489 			newxprt->sc_snd_w_inv ? "" : "un");
490 	}
491 }
492 
/*
 * This function handles the CONNECT_REQUEST event on a listening
 * endpoint. It is passed the cma_id for the _new_ connection. The context in
 * this cma_id is inherited from the listening cma_id and is the svc_xprt
 * structure for the listening endpoint.
 *
 * This function creates a new xprt for the new connection and enqueues it on
 * the accept queue for the listen xprt. When the listen thread is kicked, it
 * will call the recvfrom method on the listen xprt which will accept the new
 * connection.
 *
 * If the new transport cannot be allocated the request is dropped with
 * only a debug message.
 */
static void handle_connect_req(struct rdma_cm_id *new_cma_id,
			       struct rdma_conn_param *param)
{
	/* Must be read before the context is re-pointed at newxprt below */
	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
	struct svcxprt_rdma *newxprt;
	struct sockaddr *sa;

	/* Create a new transport */
	newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
	if (!newxprt) {
		dprintk("svcrdma: failed to create new transport\n");
		return;
	}
	newxprt->sc_cm_id = new_cma_id;
	new_cma_id->context = newxprt;
	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
		newxprt, newxprt->sc_cm_id, listen_xprt);
	svc_rdma_parse_connect_private(newxprt, param);

	/* Save client advertised inbound read limit for use later in accept. */
	newxprt->sc_ord = param->initiator_depth;

	/* Set the local and remote addresses in the transport */
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));

	/*
	 * Enqueue the new transport on the accept queue of the listening
	 * transport
	 */
	spin_lock_bh(&listen_xprt->sc_lock);
	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
	spin_unlock_bh(&listen_xprt->sc_lock);

	/* Flag the pending connection and kick the listener */
	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
	svc_xprt_enqueue(&listen_xprt->sc_xprt);
}
543 
544 /*
545  * Handles events generated on the listening endpoint. These events will be
546  * either be incoming connect requests or adapter removal  events.
547  */
548 static int rdma_listen_handler(struct rdma_cm_id *cma_id,
549 			       struct rdma_cm_event *event)
550 {
551 	struct svcxprt_rdma *xprt = cma_id->context;
552 	int ret = 0;
553 
554 	switch (event->event) {
555 	case RDMA_CM_EVENT_CONNECT_REQUEST:
556 		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
557 			"event = %s (%d)\n", cma_id, cma_id->context,
558 			rdma_event_msg(event->event), event->event);
559 		handle_connect_req(cma_id, &event->param.conn);
560 		break;
561 
562 	case RDMA_CM_EVENT_ESTABLISHED:
563 		/* Accept complete */
564 		dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
565 			"cm_id=%p\n", xprt, cma_id);
566 		break;
567 
568 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
569 		dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
570 			xprt, cma_id);
571 		if (xprt)
572 			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
573 		break;
574 
575 	default:
576 		dprintk("svcrdma: Unexpected event on listening endpoint %p, "
577 			"event = %s (%d)\n", cma_id,
578 			rdma_event_msg(event->event), event->event);
579 		break;
580 	}
581 
582 	return ret;
583 }
584 
585 static int rdma_cma_handler(struct rdma_cm_id *cma_id,
586 			    struct rdma_cm_event *event)
587 {
588 	struct svc_xprt *xprt = cma_id->context;
589 	struct svcxprt_rdma *rdma =
590 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
591 	switch (event->event) {
592 	case RDMA_CM_EVENT_ESTABLISHED:
593 		/* Accept complete */
594 		svc_xprt_get(xprt);
595 		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
596 			"cm_id=%p\n", xprt, cma_id);
597 		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
598 		svc_xprt_enqueue(xprt);
599 		break;
600 	case RDMA_CM_EVENT_DISCONNECTED:
601 		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
602 			xprt, cma_id);
603 		if (xprt) {
604 			set_bit(XPT_CLOSE, &xprt->xpt_flags);
605 			svc_xprt_enqueue(xprt);
606 			svc_xprt_put(xprt);
607 		}
608 		break;
609 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
610 		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
611 			"event = %s (%d)\n", cma_id, xprt,
612 			rdma_event_msg(event->event), event->event);
613 		if (xprt) {
614 			set_bit(XPT_CLOSE, &xprt->xpt_flags);
615 			svc_xprt_enqueue(xprt);
616 			svc_xprt_put(xprt);
617 		}
618 		break;
619 	default:
620 		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
621 			"event = %s (%d)\n", cma_id,
622 			rdma_event_msg(event->event), event->event);
623 		break;
624 	}
625 	return 0;
626 }
627 
/*
 * Create a listening RDMA service endpoint.
 *
 * @serv:  service the new listener belongs to
 * @net:   not referenced here; the CM ID is created in init_net
 * @sa:    address to bind; only AF_INET and AF_INET6 are accepted
 * @salen: length passed to svc_xprt_set_local() for the bound address
 * @flags: not referenced here
 *
 * Returns the listener's svc_xprt on success. On failure returns
 * ERR_PTR(-EAFNOSUPPORT) for an unsupported address family,
 * ERR_PTR(-ENOMEM) if the transport cannot be allocated, or the
 * ERR_PTR-encoded error from the failing rdma_* call.
 */
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
					struct net *net,
					struct sockaddr *sa, int salen,
					int flags)
{
	struct rdma_cm_id *listen_id;
	struct svcxprt_rdma *cma_xprt;
	int ret;

	dprintk("svcrdma: Creating RDMA socket\n");
	if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) {
		dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family);
		return ERR_PTR(-EAFNOSUPPORT);
	}
	cma_xprt = rdma_create_xprt(serv, 1);
	if (!cma_xprt)
		return ERR_PTR(-ENOMEM);

	/* The listener transport becomes the CM ID's context */
	listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
				   RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(listen_id)) {
		ret = PTR_ERR(listen_id);
		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
		goto err0;
	}

	/* Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
#if IS_ENABLED(CONFIG_IPV6)
	ret = rdma_set_afonly(listen_id, 1);
	if (ret) {
		dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret);
		goto err1;
	}
#endif
	ret = rdma_bind_addr(listen_id, sa);
	if (ret) {
		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
		goto err1;
	}
	cma_xprt->sc_cm_id = listen_id;

	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
	if (ret) {
		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
		goto err1;
	}

	/*
	 * We need to use the address from the cm_id in case the
	 * caller specified 0 for the port number.
	 */
	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);

	return &cma_xprt->sc_xprt;

	/* Unwind in reverse order of acquisition */
 err1:
	rdma_destroy_id(listen_id);
 err0:
	kfree(cma_xprt);
	return ERR_PTR(ret);
}
695 
/*
 * This is the xpo_recvfrom function for listening endpoints. Its
 * purpose is to accept incoming connections. The CMA callback handler
 * has already created a new transport and attached it to the new CMA
 * ID.
 *
 * There is a queue of pending connections hung on the listening
 * transport. This queue contains the new svc_xprt structure. This
 * function takes svc_xprt structures off the accept_q and completes
 * the connection.
 *
 * Returns the svc_xprt of the newly accepted connection, or NULL if the
 * accept queue was empty or any step of connection setup failed.
 */
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
{
	struct svcxprt_rdma *listen_rdma;
	struct svcxprt_rdma *newxprt = NULL;
	struct rdma_conn_param conn_param;
	struct rpcrdma_connect_private pmsg;
	struct ib_qp_init_attr qp_attr;
	struct ib_device *dev;
	struct sockaddr *sap;
	unsigned int i;
	int ret = 0;

	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
	clear_bit(XPT_CONN, &xprt->xpt_flags);
	/* Get the next entry off the accept list */
	spin_lock_bh(&listen_rdma->sc_lock);
	if (!list_empty(&listen_rdma->sc_accept_q)) {
		newxprt = list_entry(listen_rdma->sc_accept_q.next,
				     struct svcxprt_rdma, sc_accept_q);
		list_del_init(&newxprt->sc_accept_q);
	}
	/* More connections still pending: keep XPT_CONN set */
	if (!list_empty(&listen_rdma->sc_accept_q))
		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
	spin_unlock_bh(&listen_rdma->sc_lock);
	if (!newxprt)
		return NULL;

	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
		newxprt, newxprt->sc_cm_id);

	dev = newxprt->sc_cm_id->device;
	newxprt->sc_port_num = newxprt->sc_cm_id->port_num;

	/* Qualify the transport resource defaults with the
	 * capabilities of this particular device */
	newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
				  (size_t)RPCSVC_MAXPAGES);
	newxprt->sc_max_req_size = svcrdma_max_req_size;
	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
					 svcrdma_max_requests);
	newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
					    svcrdma_max_bc_requests);
	/* RQ and SQ are both sized for forward plus backchannel requests */
	newxprt->sc_rq_depth = newxprt->sc_max_requests +
			       newxprt->sc_max_bc_requests;
	newxprt->sc_sq_depth = newxprt->sc_rq_depth;
	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);

	if (!svc_rdma_prealloc_ctxts(newxprt))
		goto errout;

	/*
	 * Limit ORD based on client limit, local device limit, and
	 * configured svcrdma limit.
	 */
	newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
	newxprt->sc_ord = min_t(size_t,	svcrdma_ord, newxprt->sc_ord);

	newxprt->sc_pd = ib_alloc_pd(dev, 0);
	if (IS_ERR(newxprt->sc_pd)) {
		dprintk("svcrdma: error creating PD for connect request\n");
		goto errout;
	}
	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
					0, IB_POLL_WORKQUEUE);
	if (IS_ERR(newxprt->sc_sq_cq)) {
		dprintk("svcrdma: error creating SQ CQ for connect request\n");
		goto errout;
	}
	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
					0, IB_POLL_WORKQUEUE);
	if (IS_ERR(newxprt->sc_rq_cq)) {
		dprintk("svcrdma: error creating RQ CQ for connect request\n");
		goto errout;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = &newxprt->sc_xprt;
	qp_attr.port_num = newxprt->sc_port_num;
	qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
	qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = newxprt->sc_sq_cq;
	qp_attr.recv_cq = newxprt->sc_rq_cq;
	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
		newxprt->sc_cm_id, newxprt->sc_pd);
	dprintk("    cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);

	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
	if (ret) {
		dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
		goto errout;
	}
	newxprt->sc_qp = newxprt->sc_cm_id->qp;

	/* Send With Invalidate needs memory management extensions */
	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
		newxprt->sc_snd_w_inv = false;
	/* Only iWARP, IB and RoCE ports are accepted */
	if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) &&
	    !rdma_ib_or_roce(dev, newxprt->sc_port_num))
		goto errout;

	/* Post receive buffers */
	for (i = 0; i < newxprt->sc_max_requests; i++) {
		ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
		if (ret) {
			dprintk("svcrdma: failure posting receive buffers\n");
			goto errout;
		}
	}

	/* Swap out the handler */
	newxprt->sc_cm_id->event_handler = rdma_cma_handler;

	/* Construct RDMA-CM private message */
	pmsg.cp_magic = rpcrdma_cmp_magic;
	pmsg.cp_version = RPCRDMA_CMP_VERSION;
	pmsg.cp_flags = 0;
	pmsg.cp_send_size = pmsg.cp_recv_size =
		rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);

	/* Accept Connection */
	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
	memset(&conn_param, 0, sizeof conn_param);
	conn_param.responder_resources = 0;
	conn_param.initiator_depth = newxprt->sc_ord;
	conn_param.private_data = &pmsg;
	conn_param.private_data_len = sizeof(pmsg);
	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
	if (ret) {
		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
		       ret);
		goto errout;
	}

	dprintk("svcrdma: new connection %p accepted:\n", newxprt);
	sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
	dprintk("    local address   : %pIS:%u\n", sap, rpc_get_port(sap));
	sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
	dprintk("    remote address  : %pIS:%u\n", sap, rpc_get_port(sap));
	dprintk("    max_sge         : %d\n", newxprt->sc_max_sge);
	dprintk("    sq_depth        : %d\n", newxprt->sc_sq_depth);
	dprintk("    max_requests    : %d\n", newxprt->sc_max_requests);
	dprintk("    ord             : %d\n", newxprt->sc_ord);

	return &newxprt->sc_xprt;

 errout:
	/* Note: ret is still 0 here for failures that do not set it
	 * (ctxt preallocation, PD/CQ allocation, unsupported protocol).
	 */
	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
	/* Take a reference in case the DTO handler runs */
	svc_xprt_get(&newxprt->sc_xprt);
	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
		ib_destroy_qp(newxprt->sc_qp);
	rdma_destroy_id(newxprt->sc_cm_id);
	/* This call to put will destroy the transport */
	svc_xprt_put(&newxprt->sc_xprt);
	return NULL;
}
872 
/* xpo_release_rqst method: intentionally empty for this transport. */
static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
{
}
876 
877 /*
878  * When connected, an svc_xprt has at least two references:
879  *
880  * - A reference held by the cm_id between the ESTABLISHED and
881  *   DISCONNECTED events. If the remote peer disconnected first, this
882  *   reference could be gone.
883  *
884  * - A reference held by the svc_recv code that called this function
885  *   as part of close processing.
886  *
887  * At a minimum one references should still be held.
888  */
889 static void svc_rdma_detach(struct svc_xprt *xprt)
890 {
891 	struct svcxprt_rdma *rdma =
892 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
893 	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
894 
895 	/* Disconnect and flush posted WQE */
896 	rdma_disconnect(rdma->sc_cm_id);
897 }
898 
/*
 * Work item that tears down a transport: drains the QP, releases any
 * ctxts still queued for processing, drops the backchannel transport,
 * destroys the ctxt pools and verbs resources, and finally frees the
 * svcxprt_rdma itself. Queued by svc_rdma_free().
 */
static void __svc_rdma_free(struct work_struct *work)
{
	struct svcxprt_rdma *rdma =
		container_of(work, struct svcxprt_rdma, sc_work);
	struct svc_xprt *xprt = &rdma->sc_xprt;

	dprintk("svcrdma: %s(%p)\n", __func__, rdma);

	/* sc_qp is only set once QP creation succeeds (see accept) */
	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
		ib_drain_qp(rdma->sc_qp);

	/* We should only be called from kref_put */
	if (kref_read(&xprt->xpt_ref) != 0)
		pr_err("svcrdma: sc_xprt still in use? (%d)\n",
		       kref_read(&xprt->xpt_ref));

	/* Return ctxts that were queued but never consumed */
	while (!list_empty(&rdma->sc_read_complete_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_first_entry(&rdma->sc_read_complete_q,
					struct svc_rdma_op_ctxt, list);
		list_del(&ctxt->list);
		svc_rdma_put_context(ctxt, 1);
	}
	while (!list_empty(&rdma->sc_rq_dto_q)) {
		struct svc_rdma_op_ctxt *ctxt;
		ctxt = list_first_entry(&rdma->sc_rq_dto_q,
					struct svc_rdma_op_ctxt, list);
		list_del(&ctxt->list);
		svc_rdma_put_context(ctxt, 1);
	}

	/* Warn if we leaked a resource or under-referenced */
	if (rdma->sc_ctxt_used != 0)
		pr_err("svcrdma: ctxt still in use? (%d)\n",
		       rdma->sc_ctxt_used);

	/* Final put of backchannel client transport */
	if (xprt->xpt_bc_xprt) {
		xprt_put(xprt->xpt_bc_xprt);
		xprt->xpt_bc_xprt = NULL;
	}

	/* All ctxts are back on their free lists; release the pools */
	svc_rdma_destroy_rw_ctxts(rdma);
	svc_rdma_destroy_ctxts(rdma);

	/* Destroy the QP if present (not a listener) */
	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
		ib_destroy_qp(rdma->sc_qp);

	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
		ib_free_cq(rdma->sc_sq_cq);

	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
		ib_free_cq(rdma->sc_rq_cq);

	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
		ib_dealloc_pd(rdma->sc_pd);

	/* Destroy the CM ID */
	rdma_destroy_id(rdma->sc_cm_id);

	kfree(rdma);
}
962 
963 static void svc_rdma_free(struct svc_xprt *xprt)
964 {
965 	struct svcxprt_rdma *rdma =
966 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
967 	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
968 	queue_work(svc_rdma_wq, &rdma->sc_work);
969 }
970 
971 static int svc_rdma_has_wspace(struct svc_xprt *xprt)
972 {
973 	struct svcxprt_rdma *rdma =
974 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
975 
976 	/*
977 	 * If there are already waiters on the SQ,
978 	 * return false.
979 	 */
980 	if (waitqueue_active(&rdma->sc_send_wait))
981 		return 0;
982 
983 	/* Otherwise return true. */
984 	return 1;
985 }
986 
/* xpo_secure_port method: unconditionally returns 1 for every request. */
static int svc_rdma_secure_port(struct svc_rqst *rqstp)
{
	return 1;
}
991 
/* xpo_kill_temp_xprt method: intentionally empty for this transport. */
static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
{
}
995 
/**
 * svc_rdma_send - post a chain of Send WRs, waiting for SQ space
 * @xprt: transport whose QP receives the WRs
 * @wr: first WR of a chain linked through ->next
 *
 * Reserves one Send Queue entry per WR in the chain from sc_sq_avail.
 * If not enough entries are available, the reservation is undone and
 * the caller sleeps on sc_send_wait until more become available, then
 * retries. One transport reference is taken per WR before posting.
 *
 * Returns 0 on success, -ENOTCONN if the transport is marked for close
 * (on entry or after waiting), or the ib_post_send() error, in which
 * case the transport is marked XPT_CLOSE and the references are dropped.
 */
int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
{
	struct ib_send_wr *bad_wr, *n_wr;
	int wr_count;
	int i;
	int ret;

	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
		return -ENOTCONN;

	/* Count the WRs in the chain */
	wr_count = 1;
	for (n_wr = wr->next; n_wr; n_wr = n_wr->next)
		wr_count++;

	/* If the SQ is full, wait until an SQ entry is available */
	while (1) {
		if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) {
			atomic_inc(&rdma_stat_sq_starve);

			/* Wait until SQ WR available if SQ still full */
			atomic_add(wr_count, &xprt->sc_sq_avail);
			wait_event(xprt->sc_send_wait,
				   atomic_read(&xprt->sc_sq_avail) > wr_count);
			if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
				return -ENOTCONN;
			continue;
		}
		/* Take a transport ref for each WR posted */
		for (i = 0; i < wr_count; i++)
			svc_xprt_get(&xprt->sc_xprt);

		/* Bump used SQ WR count and post */
		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
		if (ret) {
			/* NOTE(review): the SQ entries reserved above are not
			 * returned to sc_sq_avail on this path - confirm this
			 * is intended given the transport is being closed.
			 */
			set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
			for (i = 0; i < wr_count; i ++)
				svc_xprt_put(&xprt->sc_xprt);
			dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret);
			dprintk("    sc_sq_avail=%d, sc_sq_depth=%d\n",
				atomic_read(&xprt->sc_sq_avail),
				xprt->sc_sq_depth);
			/* Let other waiters observe XPT_CLOSE */
			wake_up(&xprt->sc_send_wait);
		}
		break;
	}
	return ret;
}
1043