xref: /openbmc/linux/drivers/infiniband/sw/rdmavt/qp.c (revision bfbac097)
/*
 * Copyright(c) 2015 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>
#include "qp.h"
#include "vt.h"

/*
 * Note that it is OK to post send work requests in the SQE and ERR
 * states; rvt_do_send() will process them and generate error
 * completions as per IB 1.2 C10-96.
 */
const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = 0,
	[IB_QPS_INIT] = RVT_POST_RECV_OK,
	[IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
	[IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
	    RVT_PROCESS_NEXT_SEND_OK,
	[IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
	[IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
	[IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
};
EXPORT_SYMBOL(ib_rvt_state_ops);
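
/*
 * Illustrative use of the table above (a sketch, not a call made at this
 * point in the file): a send or receive path is expected to gate work on
 * the current QP state, e.g.
 *
 *	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
 *		return;		(QP state does not allow sending right now)
 *
 * rvt_post_send() below performs exactly this kind of check with
 * RVT_POST_SEND_OK before queuing new work requests.
 */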

static void get_map_page(struct rvt_qpn_table *qpt,
			 struct rvt_qpn_map *map,
			 gfp_t gfp)
{
	unsigned long page = get_zeroed_page(gfp);

	/*
	 * Free the page if someone raced with us installing it.
	 */

	spin_lock(&qpt->lock);
	if (map->page)
		free_page(page);
	else
		map->page = (void *)page;
	spin_unlock(&qpt->lock);
}
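
/*
 * Note for callers: the page is allocated outside qpt->lock, and
 * get_zeroed_page() may fail, so map->page can still be NULL on return.
 * Both init_qpn_table() and alloc_qpn() re-check map->page after calling
 * this helper.
 */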

/**
 * init_qpn_table - initialize the QP number table for a device
 * @rdi: rvt device info structure
 * @qpt: the QPN table
 */
static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
{
	u32 offset, i;
	struct rvt_qpn_map *map;
	int ret = 0;

	if (rdi->dparms.qpn_res_end < rdi->dparms.qpn_res_start)
		return -EINVAL;

	spin_lock_init(&qpt->lock);

	qpt->last = rdi->dparms.qpn_start;
	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;

	/*
	 * Drivers may want some QPs beyond what verbs needs; let them use
	 * our QPN table rather than keeping a second one. Mark the bitmaps
	 * for those QPNs here. The reserved range must be *after* the range
	 * which verbs will pick from.
	 */

	/* Figure out number of bit maps needed before reserved range */
	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;

	/* This should always be zero */
	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;

	/* Starting with the first reserved bit map */
	map = &qpt->map[qpt->nmaps];

	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
		    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
		if (!map->page) {
			get_map_page(qpt, map, GFP_KERNEL);
			if (!map->page) {
				ret = -ENOMEM;
				break;
			}
		}
		set_bit(offset, map->page);
		offset++;
		if (offset == RVT_BITS_PER_PAGE) {
			/* next page */
			qpt->nmaps++;
			map++;
			offset = 0;
		}
	}
	return ret;
}
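
/*
 * Marking the reserved QPNs as "in use" above means alloc_qpn() will skip
 * them: test_and_set_bit() on an already-set bit fails, so the search
 * simply moves past the reserved range.
 */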

/**
 * free_qpn_table - free the QP number table for a device
 * @qpt: the QPN table
 */
static void free_qpn_table(struct rvt_qpn_table *qpt)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
		free_page((unsigned long)qpt->map[i].page);
}

int rvt_driver_qp_init(struct rvt_dev_info *rdi)
{
	int i;
	int ret = -ENOMEM;

	if (rdi->flags & RVT_FLAG_QP_INIT_DRIVER) {
		rvt_pr_info(rdi, "Driver is doing QP init.\n");
		return 0;
	}

	if (!rdi->dparms.qp_table_size)
		return -EINVAL;

	/*
	 * Since rvt (not the driver) is doing the QP allocation here, make
	 * sure the driver has provided the QP functions we depend on.
	 */
	if (!rdi->driver_f.free_all_qps ||
	    !rdi->driver_f.qp_priv_alloc ||
	    !rdi->driver_f.qp_priv_free ||
	    !rdi->driver_f.notify_qp_reset)
		return -EINVAL;

	/* allocate parent object */
	rdi->qp_dev = kzalloc(sizeof(*rdi->qp_dev), GFP_KERNEL);
	if (!rdi->qp_dev)
		return -ENOMEM;

	/* allocate hash table */
	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
	rdi->qp_dev->qp_table =
		kmalloc(rdi->qp_dev->qp_table_size *
			sizeof(*rdi->qp_dev->qp_table),
			GFP_KERNEL);
	if (!rdi->qp_dev->qp_table)
		goto no_qp_table;

	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
		RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);

	spin_lock_init(&rdi->qp_dev->qpt_lock);

	/* initialize qpn map */
	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
		goto fail_table;

	spin_lock_init(&rdi->n_qps_lock);

	return 0;

fail_table:
	kfree(rdi->qp_dev->qp_table);
	free_qpn_table(&rdi->qp_dev->qpn_table);

no_qp_table:
	kfree(rdi->qp_dev);

	return ret;
}
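
/*
 * Driver-side sketch (illustrative, not code from this file): before
 * registering with rvt, a driver such as hfi1 or qib is expected to fill
 * in dparms.qp_table_size and the callbacks checked above, roughly:
 *
 *	rdi->dparms.qp_table_size = 256;
 *	rdi->driver_f.qp_priv_alloc = my_qp_priv_alloc;
 *	rdi->driver_f.qp_priv_free = my_qp_priv_free;
 *	rdi->driver_f.notify_qp_reset = my_notify_qp_reset;
 *	rdi->driver_f.free_all_qps = my_free_all_qps;
 *
 * The intent is that rvt_register_device() (vt.c) invokes
 * rvt_driver_qp_init() during registration. The "my_*" names above are
 * hypothetical placeholders.
 */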

/**
 * rvt_free_all_qps - check for QPs still in use
 * @rdi: rvt device info structure
 *
 * Empty the per-device QP hash table. There should not be any QPs still
 * in use; return the number found so the caller can report a leak.
 */
static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
{
	unsigned long flags;
	struct rvt_qp *qp;
	unsigned n, qp_inuse = 0;
	spinlock_t *ql; /* work around too long line below */

	if (rdi->driver_f.free_all_qps)
		qp_inuse = rdi->driver_f.free_all_qps(rdi);

	if (!rdi->qp_dev)
		return qp_inuse;

	ql = &rdi->qp_dev->qpt_lock;
	spin_lock_irqsave(ql, flags);
	for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
		qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
					       lockdep_is_held(ql));
		RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);

		for (; qp; qp = rcu_dereference_protected(qp->next,
							  lockdep_is_held(ql)))
			qp_inuse++;
	}
	spin_unlock_irqrestore(ql, flags);
	synchronize_rcu();
	return qp_inuse;
}

void rvt_qp_exit(struct rvt_dev_info *rdi)
{
	u32 qps_inuse = rvt_free_all_qps(rdi);

	if (qps_inuse)
		rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
			   qps_inuse);
	if (!rdi->qp_dev)
		return;

	if (rdi->flags & RVT_FLAG_QP_INIT_DRIVER)
		return; /* driver did the qp init so nothing else to do */

	kfree(rdi->qp_dev->qp_table);
	free_qpn_table(&rdi->qp_dev->qpn_table);
	kfree(rdi->qp_dev);
}

static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
			      struct rvt_qpn_map *map, unsigned off)
{
	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
}
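
/*
 * Worked example (assuming 4 KiB pages, so RVT_BITS_PER_PAGE would be
 * 32768): QPN 70000 lives in map 70000 / 32768 = 2 at bit offset
 * 70000 % 32768 = 4464, and mk_qpn(qpt, &qpt->map[2], 4464) reconstructs
 * 70000. free_qpn() below does the inverse split with the same divisor
 * and mask.
 */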

/*
 * Allocate the next available QPN or
 * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
 */
static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
		     enum ib_qp_type type, u8 port, gfp_t gfp)
{
	u32 i, offset, max_scan, qpn;
	struct rvt_qpn_map *map;
	u32 ret;

	if (rdi->driver_f.alloc_qpn)
		return rdi->driver_f.alloc_qpn(rdi, qpt, type, port, gfp);

	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
		unsigned n;

		ret = type == IB_QPT_GSI;
		n = 1 << (ret + 2 * (port - 1));
		spin_lock(&qpt->lock);
		if (qpt->flags & n)
			ret = -EINVAL;
		else
			qpt->flags |= n;
		spin_unlock(&qpt->lock);
		goto bail;
	}

	qpn = qpt->last + qpt->incr;
	if (qpn >= RVT_QPN_MAX)
		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
	/* offset carries bit 0 */
	offset = qpn & RVT_BITS_PER_PAGE_MASK;
	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
	max_scan = qpt->nmaps - !offset;
	for (i = 0;;) {
		if (unlikely(!map->page)) {
			get_map_page(qpt, map, gfp);
			if (unlikely(!map->page))
				break;
		}
		do {
			if (!test_and_set_bit(offset, map->page)) {
				qpt->last = qpn;
				ret = qpn;
				goto bail;
			}
			offset += qpt->incr;
			/*
			 * This qpn might be bogus if offset >=
			 * RVT_BITS_PER_PAGE. That is OK; it gets reassigned
			 * below.
			 */
			qpn = mk_qpn(qpt, map, offset);
		} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
		/*
		 * In order to keep the number of pages allocated to a
		 * minimum, we scan all existing pages before increasing
		 * the size of the bitmap table.
		 */
		if (++i > max_scan) {
			if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
				break;
			map = &qpt->map[qpt->nmaps++];
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else if (map < &qpt->map[qpt->nmaps]) {
			++map;
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else {
			map = &qpt->map[0];
			/* wrap to first map page, invert bit 0 */
			offset = qpt->incr | ((offset & 1) ^ 1);
		}
		/* there can be no bits at shift and below */
		WARN_ON(offset & (rdi->dparms.qos_shift - 1));
		qpn = mk_qpn(qpt, map, offset);
	}

	ret = -ENOMEM;

bail:
	return ret;
}
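
/*
 * SMI/GSI bookkeeping above, spelled out: ret is 0 for SMI and 1 for GSI,
 * so n = 1 << (ret + 2 * (port - 1)) maps port 1 SMI to bit 0, port 1 GSI
 * to bit 1, port 2 SMI to bit 2, port 2 GSI to bit 3, and so on. Each
 * special QP can therefore be handed out only once per port.
 */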

static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
{
	struct rvt_qpn_map *map;

	map = qpt->map + qpn / RVT_BITS_PER_PAGE;
	if (map->page)
		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
}

/**
 * rvt_reset_qp - initialize the QP state to the reset state
 * @rdi: rvt device info structure
 * @qp: the QP to reset
 * @type: the QP type
 */
void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
		  enum ib_qp_type type)
{
	qp->remote_qpn = 0;
	qp->qkey = 0;
	qp->qp_access_flags = 0;

	/*
	 * Let the driver do anything it needs to for a new/reset qp.
	 */
	rdi->driver_f.notify_qp_reset(qp);

	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
	qp->s_hdrwords = 0;
	qp->s_wqe = NULL;
	qp->s_draining = 0;
	qp->s_next_psn = 0;
	qp->s_last_psn = 0;
	qp->s_sending_psn = 0;
	qp->s_sending_hpsn = 0;
	qp->s_psn = 0;
	qp->r_psn = 0;
	qp->r_msn = 0;
	if (type == IB_QPT_RC) {
		qp->s_state = IB_OPCODE_RC_SEND_LAST;
		qp->r_state = IB_OPCODE_RC_SEND_LAST;
	} else {
		qp->s_state = IB_OPCODE_UC_SEND_LAST;
		qp->r_state = IB_OPCODE_UC_SEND_LAST;
	}
	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
	qp->r_nak_state = 0;
	qp->r_aflags = 0;
	qp->r_flags = 0;
	qp->s_head = 0;
	qp->s_tail = 0;
	qp->s_cur = 0;
	qp->s_acked = 0;
	qp->s_last = 0;
	qp->s_ssn = 1;
	qp->s_lsn = 0;
	qp->s_mig_state = IB_MIG_MIGRATED;
	memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
	qp->r_head_ack_queue = 0;
	qp->s_tail_ack_queue = 0;
	qp->s_num_rd_atomic = 0;
	if (qp->r_rq.wq) {
		qp->r_rq.wq->head = 0;
		qp->r_rq.wq->tail = 0;
	}
	qp->r_sge.num_sge = 0;
}
EXPORT_SYMBOL(rvt_reset_qp);

/**
 * rvt_create_qp - create a queue pair for a device
 * @ibpd: the protection domain whose device we create the queue pair for
 * @init_attr: the attributes of the queue pair
 * @udata: user data for libibverbs.so
 *
 * Queue pair creation is mostly an rvt issue. However, drivers have their own
 * unique idea of what queue pair numbers mean. For instance, there is a
 * reserved range for PSM.
 *
 * Returns the queue pair on success, otherwise returns an errno.
 *
 * Called by the ib_create_qp() core verbs function.
 */
struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
			    struct ib_qp_init_attr *init_attr,
			    struct ib_udata *udata)
{
	struct rvt_qp *qp;
	int err;
	struct rvt_swqe *swq = NULL;
	size_t sz;
	size_t sg_list_sz;
	struct ib_qp *ret = ERR_PTR(-ENOMEM);
	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
	void *priv = NULL;
	gfp_t gfp;

	if (!rdi)
		return ERR_PTR(-EINVAL);

	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge ||
	    init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
	    init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
		return ERR_PTR(-EINVAL);

	/* GFP_NOIO is applicable to RC QPs only */
	if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
	    init_attr->qp_type != IB_QPT_RC)
		return ERR_PTR(-EINVAL);

	gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
						GFP_NOIO : GFP_KERNEL;

	/* Check receive queue parameters if no SRQ is specified. */
	if (!init_attr->srq) {
		if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge ||
		    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
			return ERR_PTR(-EINVAL);

		if (init_attr->cap.max_send_sge +
		    init_attr->cap.max_send_wr +
		    init_attr->cap.max_recv_sge +
		    init_attr->cap.max_recv_wr == 0)
			return ERR_PTR(-EINVAL);
	}

	switch (init_attr->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		if (init_attr->port_num == 0 ||
		    init_attr->port_num > ibpd->device->phys_port_cnt)
			return ERR_PTR(-EINVAL);
		/* fall through */
	case IB_QPT_UC:
	case IB_QPT_RC:
	case IB_QPT_UD:
		sz = sizeof(struct rvt_sge) *
			init_attr->cap.max_send_sge +
			sizeof(struct rvt_swqe);
		if (gfp == GFP_NOIO)
			swq = __vmalloc(
				(init_attr->cap.max_send_wr + 1) * sz,
				gfp, PAGE_KERNEL);
		else
			swq = vmalloc(
				(init_attr->cap.max_send_wr + 1) * sz);
		if (!swq)
			return ERR_PTR(-ENOMEM);

		sz = sizeof(*qp);
		sg_list_sz = 0;
		if (init_attr->srq) {
			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);

			if (srq->rq.max_sge > 1)
				sg_list_sz = sizeof(*qp->r_sg_list) *
					(srq->rq.max_sge - 1);
		} else if (init_attr->cap.max_recv_sge > 1)
			sg_list_sz = sizeof(*qp->r_sg_list) *
				(init_attr->cap.max_recv_sge - 1);
		qp = kzalloc(sz + sg_list_sz, gfp);
		if (!qp)
			goto bail_swq;

		RCU_INIT_POINTER(qp->next, NULL);

		/*
		 * Driver needs to set up its private QP structure and do any
		 * initialization that is needed.
		 */
		priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
		if (!priv)
			goto bail_qp;
		qp->priv = priv;
		qp->timeout_jiffies =
			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
				1000UL);
		if (init_attr->srq) {
			sz = 0;
		} else {
			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
				sizeof(struct rvt_rwqe);
			if (udata)
				qp->r_rq.wq = vmalloc_user(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz);
			else if (gfp == GFP_NOIO)
				qp->r_rq.wq = __vmalloc(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz,
						gfp, PAGE_KERNEL);
			else
				qp->r_rq.wq = vmalloc(
						sizeof(struct rvt_rwq) +
						qp->r_rq.size * sz);
			if (!qp->r_rq.wq)
				goto bail_driver_priv;
		}

		/*
		 * ib_create_qp() will initialize qp->ibqp
		 * except for qp->ibqp.qp_num.
		 */
		spin_lock_init(&qp->r_lock);
		spin_lock_init(&qp->s_lock);
		spin_lock_init(&qp->r_rq.lock);
		atomic_set(&qp->refcount, 0);
		init_waitqueue_head(&qp->wait);
		init_timer(&qp->s_timer);
		qp->s_timer.data = (unsigned long)qp;
		INIT_LIST_HEAD(&qp->rspwait);
		qp->state = IB_QPS_RESET;
		qp->s_wq = swq;
		qp->s_size = init_attr->cap.max_send_wr + 1;
		qp->s_max_sge = init_attr->cap.max_send_sge;
		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
			qp->s_flags = RVT_S_SIGNAL_REQ_WR;

		err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
				init_attr->qp_type,
				init_attr->port_num, gfp);
		if (err < 0) {
			ret = ERR_PTR(err);
			goto bail_rq_wq;
		}
		qp->ibqp.qp_num = err;
		qp->port_num = init_attr->port_num;
		rvt_reset_qp(rdi, qp, init_attr->qp_type);
		break;

	default:
		/* Don't support raw QPs */
		return ERR_PTR(-EINVAL);
	}

	init_attr->cap.max_inline_data = 0;

	/*
	 * Return the address of the RWQ as the offset to mmap.
	 * See rvt_mmap() for details.
	 */
	if (udata && udata->outlen >= sizeof(__u64)) {
		if (!qp->r_rq.wq) {
			__u64 offset = 0;

			err = ib_copy_to_udata(udata, &offset,
					       sizeof(offset));
			if (err) {
				ret = ERR_PTR(err);
				goto bail_qpn;
			}
		} else {
			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;

			qp->ip = rvt_create_mmap_info(rdi, s,
						      ibpd->uobject->context,
						      qp->r_rq.wq);
			if (!qp->ip) {
				ret = ERR_PTR(-ENOMEM);
				goto bail_qpn;
			}

			err = ib_copy_to_udata(udata, &qp->ip->offset,
					       sizeof(qp->ip->offset));
			if (err) {
				ret = ERR_PTR(err);
				goto bail_ip;
			}
		}
	}

	spin_lock(&rdi->n_qps_lock);
	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
		spin_unlock(&rdi->n_qps_lock);
		ret = ERR_PTR(-ENOMEM);
		goto bail_ip;
	}

	rdi->n_qps_allocated++;
	spin_unlock(&rdi->n_qps_lock);

	if (qp->ip) {
		spin_lock_irq(&rdi->pending_lock);
		list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	ret = &qp->ibqp;

	/*
	 * We have our QP and it's good. Now keep track of what types of
	 * opcodes can be processed on this QP. We do this by keeping track of
	 * what the 3 high order bits of the opcode are.
	 */
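	/*
	 * Concretely (values per the IBTA opcode layout, stated here only
	 * for illustration): IB_OPCODE_RC_SEND_ONLY, IB_OPCODE_UC_SEND_ONLY
	 * and IB_OPCODE_UD_SEND_ONLY masked with RVT_OPCODE_QP_MASK yield
	 * the RC, UC and UD transport prefixes (0x00, 0x20 and 0x60), so an
	 * incoming opcode can later be validated with a single mask and
	 * compare against qp->allowed_ops.
	 */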
	switch (init_attr->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
	case IB_QPT_UD:
		qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK;
		break;
	case IB_QPT_RC:
		qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK;
		break;
	case IB_QPT_UC:
		qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK;
		break;
	default:
		ret = ERR_PTR(-EINVAL);
		goto bail_ip;
	}

	return ret;

bail_ip:
	if (qp->ip)
		kref_put(&qp->ip->ref, rvt_release_mmap_info);

bail_qpn:
	free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);

bail_rq_wq:
	vfree(qp->r_rq.wq);

bail_driver_priv:
	rdi->driver_f.qp_priv_free(rdi, qp);

bail_qp:
	kfree(qp);

bail_swq:
	vfree(swq);

	return ret;
}
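
/*
 * Consumer-side sketch (illustrative only; not part of this file): a kernel
 * ULP reaches this function through the core verbs layer, roughly:
 *
 *	struct ib_qp_init_attr attr = {
 *		.qp_type = IB_QPT_RC,
 *		.sq_sig_type = IB_SIGNAL_REQ_WR,
 *		.cap = { .max_send_wr = 64, .max_recv_wr = 64,
 *			 .max_send_sge = 1, .max_recv_sge = 1 },
 *	};
 *	qp = ib_create_qp(pd, &attr);
 *
 * ib_create_qp() then calls the device's create_qp method, which for an
 * rvt-based driver is expected to be rvt_create_qp(). The numbers above
 * are arbitrary examples, subject to the dparms.props limits checked at
 * the top of this function.
 */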

/**
 * rvt_modify_qp - modify the attributes of a queue pair
 * @ibqp: the queue pair whose attributes we're modifying
 * @attr: the new attributes
 * @attr_mask: the mask of attributes to modify
 * @udata: user data for libibverbs.so
 *
 * Returns 0 on success, otherwise returns an errno.
 */
int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		  int attr_mask, struct ib_udata *udata)
{
	/*
	 * VT-DRIVER-API: qp_mtu()
	 * OPA devices have a per-VL MTU; the driver has a mapping of IB SL to
	 * SC to VL, and a table of MTUs per VL. This is not something IB has,
	 * and it should not live in rvt.
	 */
	return -EOPNOTSUPP;
}

/**
 * rvt_destroy_qp - destroy a queue pair
 * @ibqp: the queue pair to destroy
 *
 * Returns 0 on success.
 *
 * Note that this can be called while the QP is actively sending or
 * receiving!
 */
int rvt_destroy_qp(struct ib_qp *ibqp)
{
	/*
	 * VT-DRIVER-API: qp_flush()
	 * Driver provides a mechanism to flush and wait for that flush to
	 * finish.
	 */

	return -EOPNOTSUPP;
}

int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
		 int attr_mask, struct ib_qp_init_attr *init_attr)
{
	return -EOPNOTSUPP;
}

/**
 * rvt_post_recv - post a receive on a QP
 * @ibqp: the QP to post the receive on
 * @wr: the WR to post
 * @bad_wr: the first bad WR is put here
 *
 * This may be called from interrupt context.
 */
int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
		  struct ib_recv_wr **bad_wr)
{
	/*
	 * When a packet arrives the driver needs to call up to rvt to process
	 * the packet. The UD, RC, UC processing will be done in rvt, however
	 * the driver should be able to override this if it so chooses.
	 * Perhaps a set of function pointers set up at registration time.
	 */

	return -EOPNOTSUPP;
}

/**
 * rvt_post_one_wr - post one RC, UC, or UD send work request
 * @qp: the QP to post on
 * @wr: the work request to send
 */
static int rvt_post_one_wr(struct rvt_qp *qp, struct ib_send_wr *wr)
{
	struct rvt_swqe *wqe;
	u32 next;
	int i;
	int j;
	int acc;
	struct rvt_lkey_table *rkt;
	struct rvt_pd *pd;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	/* IB spec says that num_sge == 0 is OK. */
	if (unlikely(wr->num_sge > qp->s_max_sge))
		return -EINVAL;

	/*
	 * Don't allow RDMA reads or atomic operations on UC, and reject
	 * undefined operations.
	 * Make sure the buffer is large enough to hold the result for
	 * atomics.
	 */
	if (qp->ibqp.qp_type == IB_QPT_UC) {
		if ((unsigned)wr->opcode >= IB_WR_RDMA_READ)
			return -EINVAL;
	} else if (qp->ibqp.qp_type != IB_QPT_RC) {
		/* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
		if (wr->opcode != IB_WR_SEND &&
		    wr->opcode != IB_WR_SEND_WITH_IMM)
			return -EINVAL;
		/* Check UD destination address PD */
		if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
			return -EINVAL;
	} else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
		return -EINVAL;
	} else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
		   (wr->num_sge == 0 ||
		    wr->sg_list[0].length < sizeof(u64) ||
		    wr->sg_list[0].addr & (sizeof(u64) - 1))) {
		return -EINVAL;
	} else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
		return -EINVAL;
	}

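	/*
	 * The send queue is a ring of s_size entries indexed by s_head
	 * (producer) and s_last (consumer). Advancing s_head would collide
	 * with s_last when the ring is full, so at most s_size - 1 requests
	 * can be outstanding; this is why rvt_create_qp() sizes the ring as
	 * max_send_wr + 1.
	 */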
	next = qp->s_head + 1;
	if (next >= qp->s_size)
		next = 0;
	if (next == qp->s_last)
		return -ENOMEM;

	rkt = &rdi->lkey_table;
	pd = ibpd_to_rvtpd(qp->ibqp.pd);
	wqe = rvt_get_swqe_ptr(qp, qp->s_head);

	if (qp->ibqp.qp_type != IB_QPT_UC &&
	    qp->ibqp.qp_type != IB_QPT_RC)
		memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
	else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
		 wr->opcode == IB_WR_RDMA_WRITE ||
		 wr->opcode == IB_WR_RDMA_READ)
		memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
	else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		 wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
		memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
	else
		memcpy(&wqe->wr, wr, sizeof(wqe->wr));

	wqe->length = 0;
	j = 0;
	if (wr->num_sge) {
		acc = wr->opcode >= IB_WR_RDMA_READ ?
			IB_ACCESS_LOCAL_WRITE : 0;
		for (i = 0; i < wr->num_sge; i++) {
			u32 length = wr->sg_list[i].length;
			int ok;

			if (length == 0)
				continue;
			ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
					 &wr->sg_list[i], acc);
			if (!ok)
				goto bail_inval_free;
			wqe->length += length;
			j++;
		}
		wqe->wr.num_sge = j;
	}
	if (qp->ibqp.qp_type == IB_QPT_UC ||
	    qp->ibqp.qp_type == IB_QPT_RC) {
		if (wqe->length > 0x80000000U)
			goto bail_inval_free;
	} else {
		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
	}
	wqe->ssn = qp->s_ssn++;
	qp->s_head = next;

	return 0;

bail_inval_free:
	/* release mr holds */
	while (j) {
		struct rvt_sge *sge = &wqe->sg_list[--j];

		rvt_put_mr(sge->mr);
	}
	return -EINVAL;
}

/**
 * rvt_post_send - post a send on a QP
 * @ibqp: the QP to post the send on
 * @wr: the list of work requests to post
 * @bad_wr: the first bad WR is put here
 *
 * This may be called from interrupt context.
 */
int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
		  struct ib_send_wr **bad_wr)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
	unsigned long flags = 0;
	int call_send;
	unsigned nreq = 0;
	int err = 0;

	spin_lock_irqsave(&qp->s_lock, flags);

	/*
	 * Ensure the QP state is such that we can send. If not, bail out
	 * early; there is no need to do this check for every WR we post.
	 */
	if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
		spin_unlock_irqrestore(&qp->s_lock, flags);
		return -EINVAL;
	}

	/*
	 * If the send queue is empty and we only have a single WR, just go
	 * ahead and kick the send engine into gear. Otherwise, schedule the
	 * send to happen later.
	 */
	call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;

	for (; wr; wr = wr->next) {
		err = rvt_post_one_wr(qp, wr);
		if (unlikely(err)) {
			*bad_wr = wr;
			goto bail;
		}
		nreq++;
	}
bail:
	if (nreq && !call_send)
		rdi->driver_f.schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
	if (nreq && call_send)
		rdi->driver_f.do_send(qp);
	return err;
}
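
/*
 * Illustrative path into this function (not code from this file): a ULP
 * posts through the core verbs API, e.g.
 *
 *	struct ib_send_wr *bad_wr;
 *	ret = ib_post_send(qp, &wr, &bad_wr);
 *
 * and the core dispatches to the device's post_send method, which for an
 * rvt-based driver is expected to be rvt_post_send() above. The
 * driver-supplied schedule_send()/do_send() callbacks then drain the send
 * queue.
 */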

/**
 * rvt_post_srq_recv - post a receive on a shared receive queue
 * @ibsrq: the SRQ to post the receive on
 * @wr: the list of work requests to post
 * @bad_wr: A pointer to the first WR to cause a problem is put here
 *
 * This may be called from interrupt context.
 */
int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
		      struct ib_recv_wr **bad_wr)
{
	return -EOPNOTSUPP;
}