xref: /openbmc/linux/drivers/infiniband/sw/rdmavt/qp.c (revision 2891f2d5)
1 /*
2  * Copyright(c) 2016 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 
48 #include <linux/hash.h>
49 #include <linux/bitops.h>
50 #include <linux/lockdep.h>
51 #include <linux/vmalloc.h>
52 #include <linux/slab.h>
53 #include <rdma/ib_verbs.h>
54 #include <rdma/ib_hdrs.h>
55 #include "qp.h"
56 #include "vt.h"
57 #include "trace.h"
58 
59 static void rvt_rc_timeout(unsigned long arg);
60 
61 /*
62  * Convert the AETH RNR timeout code into the number of microseconds.
63  */
64 static const u32 ib_rvt_rnr_table[32] = {
65 	655360, /* 00: 655.36 */
66 	10,     /* 01:    .01 */
67 	20,     /* 02:    .02 */
68 	30,     /* 03:    .03 */
69 	40,     /* 04:    .04 */
70 	60,     /* 05:    .06 */
71 	80,     /* 06:    .08 */
72 	120,    /* 07:    .12 */
73 	160,    /* 08:    .16 */
74 	240,    /* 09:    .24 */
75 	320,    /* 0A:    .32 */
76 	480,    /* 0B:    .48 */
77 	640,    /* 0C:    .64 */
78 	960,    /* 0D:    .96 */
79 	1280,   /* 0E:   1.28 */
80 	1920,   /* 0F:   1.92 */
81 	2560,   /* 10:   2.56 */
82 	3840,   /* 11:   3.84 */
83 	5120,   /* 12:   5.12 */
84 	7680,   /* 13:   7.68 */
85 	10240,  /* 14:  10.24 */
86 	15360,  /* 15:  15.36 */
87 	20480,  /* 16:  20.48 */
88 	30720,  /* 17:  30.72 */
89 	40960,  /* 18:  40.96 */
90 	61440,  /* 19:  61.44 */
91 	81920,  /* 1A:  81.92 */
92 	122880, /* 1B: 122.88 */
93 	163840, /* 1C: 163.84 */
94 	245760, /* 1D: 245.76 */
95 	327680, /* 1E: 327.68 */
96 	491520  /* 1F: 491.52 */
97 };
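/*
 * Example (illustrative): the comments above give the IBTA-defined delay
 * in milliseconds while the stored values are microseconds, so RNR code
 * 0x1F corresponds to 491.52 ms, i.e. ib_rvt_rnr_table[0x1F] == 491520.
 * rvt_aeth_to_usec() near the end of this file indexes the table after
 * masking the code out of the AETH credit field.
 */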
98 
99 /*
100  * Note that it is OK to post send work requests in the SQE and ERR
101  * states; rvt_do_send() will process them and generate error
102  * completions as per IB 1.2 C10-96.
103  */
104 const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
105 	[IB_QPS_RESET] = 0,
106 	[IB_QPS_INIT] = RVT_POST_RECV_OK,
107 	[IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
108 	[IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
109 	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
110 	    RVT_PROCESS_NEXT_SEND_OK,
111 	[IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
112 	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
113 	[IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
114 	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
115 	[IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
116 	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
117 };
118 EXPORT_SYMBOL(ib_rvt_state_ops);
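/*
 * Typical use (illustrative), matching the checks in rvt_post_send() and
 * rvt_post_recv() later in this file:
 *
 *	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))
 *		return -EINVAL;
 */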
119 
120 /*
121  * Translate ib_wr_opcode into ib_wc_opcode.
122  */
123 const enum ib_wc_opcode ib_rvt_wc_opcode[] = {
124 	[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
125 	[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
126 	[IB_WR_SEND] = IB_WC_SEND,
127 	[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
128 	[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
129 	[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
130 	[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
131 	[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
132 	[IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
133 	[IB_WR_REG_MR] = IB_WC_REG_MR
134 };
135 EXPORT_SYMBOL(ib_rvt_wc_opcode);
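/*
 * Rough usage sketch (illustrative): drivers index this table with the
 * posted opcode when generating a send completion, e.g.
 *
 *	wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode];
 *
 * so an IB_WR_RDMA_WRITE_WITH_IMM request completes as IB_WC_RDMA_WRITE.
 */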
136 
137 static void get_map_page(struct rvt_qpn_table *qpt,
138 			 struct rvt_qpn_map *map,
139 			 gfp_t gfp)
140 {
141 	unsigned long page = get_zeroed_page(gfp);
142 
143 	/*
144 	 * Free the page if someone raced with us installing it.
145 	 */
146 
147 	spin_lock(&qpt->lock);
148 	if (map->page)
149 		free_page(page);
150 	else
151 		map->page = (void *)page;
152 	spin_unlock(&qpt->lock);
153 }
154 
155 /**
156  * init_qpn_table - initialize the QP number table for a device
157  * @qpt: the QPN table
158  */
159 static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
160 {
161 	u32 offset, i;
162 	struct rvt_qpn_map *map;
163 	int ret = 0;
164 
165 	if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
166 		return -EINVAL;
167 
168 	spin_lock_init(&qpt->lock);
169 
170 	qpt->last = rdi->dparms.qpn_start;
171 	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
172 
173 	/*
174 	 * Drivers may want some QPs beyond what we need for verbs; let them
175 	 * use our qpn table rather than keeping a second one. Go ahead and
176 	 * mark the bitmaps for those. The reserved range must be *after* the
177 	 * range which verbs will pick from.
178 	 */
179 
180 	/* Figure out number of bit maps needed before reserved range */
181 	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
182 
183 	/* This should always be zero */
184 	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
185 
186 	/* Starting with the first reserved bit map */
187 	map = &qpt->map[qpt->nmaps];
188 
189 	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
190 		    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
191 	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
192 		if (!map->page) {
193 			get_map_page(qpt, map, GFP_KERNEL);
194 			if (!map->page) {
195 				ret = -ENOMEM;
196 				break;
197 			}
198 		}
199 		set_bit(offset, map->page);
200 		offset++;
201 		if (offset == RVT_BITS_PER_PAGE) {
202 			/* next page */
203 			qpt->nmaps++;
204 			map++;
205 			offset = 0;
206 		}
207 	}
208 	return ret;
209 }
210 
211 /**
212  * free_qpn_table - free the QP number table for a device
213  * @qpt: the QPN table
214  */
215 static void free_qpn_table(struct rvt_qpn_table *qpt)
216 {
217 	int i;
218 
219 	for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
220 		free_page((unsigned long)qpt->map[i].page);
221 }
222 
223 /**
224  * rvt_driver_qp_init - Init driver qp resources
225  * @rdi: rvt dev structure
226  *
227  * Return: 0 on success
228  */
229 int rvt_driver_qp_init(struct rvt_dev_info *rdi)
230 {
231 	int i;
232 	int ret = -ENOMEM;
233 
234 	if (!rdi->dparms.qp_table_size)
235 		return -EINVAL;
236 
237 	/*
238 	 * rdmavt handles QP allocation for the driver, so make sure the
239 	 * driver is providing the necessary QP callbacks.
240 	 */
241 	if (!rdi->driver_f.free_all_qps ||
242 	    !rdi->driver_f.qp_priv_alloc ||
243 	    !rdi->driver_f.qp_priv_free ||
244 	    !rdi->driver_f.notify_qp_reset ||
245 	    !rdi->driver_f.notify_restart_rc)
246 		return -EINVAL;
247 
248 	/* allocate parent object */
249 	rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
250 				   rdi->dparms.node);
251 	if (!rdi->qp_dev)
252 		return -ENOMEM;
253 
254 	/* allocate hash table */
255 	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
256 	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
257 	rdi->qp_dev->qp_table =
258 		kmalloc_node(rdi->qp_dev->qp_table_size *
259 			     sizeof(*rdi->qp_dev->qp_table),
260 			     GFP_KERNEL, rdi->dparms.node);
261 	if (!rdi->qp_dev->qp_table)
262 		goto no_qp_table;
263 
264 	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
265 		RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
266 
267 	spin_lock_init(&rdi->qp_dev->qpt_lock);
268 
269 	/* initialize qpn map */
270 	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
271 		goto fail_table;
272 
273 	spin_lock_init(&rdi->n_qps_lock);
274 
275 	return 0;
276 
277 fail_table:
278 	kfree(rdi->qp_dev->qp_table);
279 	free_qpn_table(&rdi->qp_dev->qpn_table);
280 
281 no_qp_table:
282 	kfree(rdi->qp_dev);
283 
284 	return ret;
285 }
286 
287 /**
288  * rvt_free_all_qps - check for QPs still in use
289  * @rdi: rvt device info structure
290  *
291  * There should not be any QPs still in use.
292  * Free memory for table.
293  */
294 static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
295 {
296 	unsigned long flags;
297 	struct rvt_qp *qp;
298 	unsigned n, qp_inuse = 0;
299 	spinlock_t *ql; /* work around too long line below */
300 
301 	if (rdi->driver_f.free_all_qps)
302 		qp_inuse = rdi->driver_f.free_all_qps(rdi);
303 
304 	qp_inuse += rvt_mcast_tree_empty(rdi);
305 
306 	if (!rdi->qp_dev)
307 		return qp_inuse;
308 
309 	ql = &rdi->qp_dev->qpt_lock;
310 	spin_lock_irqsave(ql, flags);
311 	for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
312 		qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
313 					       lockdep_is_held(ql));
314 		RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);
315 
316 		for (; qp; qp = rcu_dereference_protected(qp->next,
317 							  lockdep_is_held(ql)))
318 			qp_inuse++;
319 	}
320 	spin_unlock_irqrestore(ql, flags);
321 	synchronize_rcu();
322 	return qp_inuse;
323 }
324 
325 /**
326  * rvt_qp_exit - clean up qps on device exit
327  * @rdi: rvt dev structure
328  *
329  * Check for qp leaks and free resources.
330  */
331 void rvt_qp_exit(struct rvt_dev_info *rdi)
332 {
333 	u32 qps_inuse = rvt_free_all_qps(rdi);
334 
335 	if (qps_inuse)
336 		rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
337 			   qps_inuse);
338 	if (!rdi->qp_dev)
339 		return;
340 
341 	kfree(rdi->qp_dev->qp_table);
342 	free_qpn_table(&rdi->qp_dev->qpn_table);
343 	kfree(rdi->qp_dev);
344 }
345 
346 static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
347 			      struct rvt_qpn_map *map, unsigned off)
348 {
349 	return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
350 }
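/*
 * Worked example (illustrative, assuming 4 KiB pages so RVT_BITS_PER_PAGE
 * is 32768): QPN 40000 lives in qpt->map[40000 / 32768] = map[1] at bit
 * 40000 % 32768 = 7232, and mk_qpn(qpt, &qpt->map[1], 7232) recovers 40000.
 */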
351 
352 /**
353  * alloc_qpn - Allocate the next available qpn or zero/one for QP type
354  *	       IB_QPT_SMI/IB_QPT_GSI
355  * @rdi: rvt device info structure
356  * @qpt: queue pair number table pointer
357  * @port_num: IB port number, 1 based, comes from core
358  *
359  * Return: The queue pair number
360  */
361 static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
362 		     enum ib_qp_type type, u8 port_num, gfp_t gfp)
363 {
364 	u32 i, offset, max_scan, qpn;
365 	struct rvt_qpn_map *map;
366 	u32 ret;
367 
368 	if (rdi->driver_f.alloc_qpn)
369 		return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp);
370 
371 	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
372 		unsigned n;
373 
374 		ret = type == IB_QPT_GSI;
375 		n = 1 << (ret + 2 * (port_num - 1));
376 		spin_lock(&qpt->lock);
377 		if (qpt->flags & n)
378 			ret = -EINVAL;
379 		else
380 			qpt->flags |= n;
381 		spin_unlock(&qpt->lock);
382 		goto bail;
383 	}
384 
385 	qpn = qpt->last + qpt->incr;
386 	if (qpn >= RVT_QPN_MAX)
387 		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
388 	/* offset carries bit 0 */
389 	offset = qpn & RVT_BITS_PER_PAGE_MASK;
390 	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
391 	max_scan = qpt->nmaps - !offset;
392 	for (i = 0;;) {
393 		if (unlikely(!map->page)) {
394 			get_map_page(qpt, map, gfp);
395 			if (unlikely(!map->page))
396 				break;
397 		}
398 		do {
399 			if (!test_and_set_bit(offset, map->page)) {
400 				qpt->last = qpn;
401 				ret = qpn;
402 				goto bail;
403 			}
404 			offset += qpt->incr;
405 			/*
406 			 * This qpn might be bogus if offset >= RVT_BITS_PER_PAGE.
407 			 * That is OK.   It gets re-assigned below.
408 			 */
409 			qpn = mk_qpn(qpt, map, offset);
410 		} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
411 		/*
412 		 * In order to keep the number of pages allocated to a
413 		 * minimum, we scan all the existing pages before increasing
414 		 * the size of the bitmap table.
415 		 */
416 		if (++i > max_scan) {
417 			if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
418 				break;
419 			map = &qpt->map[qpt->nmaps++];
420 			/* start at incr with current bit 0 */
421 			offset = qpt->incr | (offset & 1);
422 		} else if (map < &qpt->map[qpt->nmaps]) {
423 			++map;
424 			/* start at incr with current bit 0 */
425 			offset = qpt->incr | (offset & 1);
426 		} else {
427 			map = &qpt->map[0];
428 			/* wrap to first map page, invert bit 0 */
429 			offset = qpt->incr | ((offset & 1) ^ 1);
430 		}
431 		/* there can be no set bits in low-order QoS bits */
432 		WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
433 		qpn = mk_qpn(qpt, map, offset);
434 	}
435 
436 	ret = -ENOMEM;
437 
438 bail:
439 	return ret;
440 }
441 
442 static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
443 {
444 	struct rvt_qpn_map *map;
445 
446 	map = qpt->map + qpn / RVT_BITS_PER_PAGE;
447 	if (map->page)
448 		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
449 }
450 
451 /**
452  * rvt_clear_mr_refs - Drop held mr refs
453  * @qp: rvt qp data structure
454  * @clr_sends: If it should clear the send side or not
455  */
456 static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
457 {
458 	unsigned n;
459 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
460 
461 	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
462 		rvt_put_ss(&qp->s_rdma_read_sge);
463 
464 	rvt_put_ss(&qp->r_sge);
465 
466 	if (clr_sends) {
467 		while (qp->s_last != qp->s_head) {
468 			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
469 			unsigned i;
470 
471 			for (i = 0; i < wqe->wr.num_sge; i++) {
472 				struct rvt_sge *sge = &wqe->sg_list[i];
473 
474 				rvt_put_mr(sge->mr);
475 			}
476 			if (qp->ibqp.qp_type == IB_QPT_UD ||
477 			    qp->ibqp.qp_type == IB_QPT_SMI ||
478 			    qp->ibqp.qp_type == IB_QPT_GSI)
479 				atomic_dec(&ibah_to_rvtah(
480 						wqe->ud_wr.ah)->refcount);
481 			if (++qp->s_last >= qp->s_size)
482 				qp->s_last = 0;
483 			smp_wmb(); /* see qp_set_savail */
484 		}
485 		if (qp->s_rdma_mr) {
486 			rvt_put_mr(qp->s_rdma_mr);
487 			qp->s_rdma_mr = NULL;
488 		}
489 	}
490 
491 	if (qp->ibqp.qp_type != IB_QPT_RC)
492 		return;
493 
494 	for (n = 0; n < rvt_max_atomic(rdi); n++) {
495 		struct rvt_ack_entry *e = &qp->s_ack_queue[n];
496 
497 		if (e->rdma_sge.mr) {
498 			rvt_put_mr(e->rdma_sge.mr);
499 			e->rdma_sge.mr = NULL;
500 		}
501 	}
502 }
503 
504 /**
505  * rvt_remove_qp - remove qp from table
506  * @rdi: rvt dev struct
507  * @qp: qp to remove
508  *
509  * Remove the QP from the table so it can't be found asynchronously by
510  * the receive routine.
511  */
512 static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
513 {
514 	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
515 	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
516 	unsigned long flags;
517 	int removed = 1;
518 
519 	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
520 
521 	if (rcu_dereference_protected(rvp->qp[0],
522 			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
523 		RCU_INIT_POINTER(rvp->qp[0], NULL);
524 	} else if (rcu_dereference_protected(rvp->qp[1],
525 			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
526 		RCU_INIT_POINTER(rvp->qp[1], NULL);
527 	} else {
528 		struct rvt_qp *q;
529 		struct rvt_qp __rcu **qpp;
530 
531 		removed = 0;
532 		qpp = &rdi->qp_dev->qp_table[n];
533 		for (; (q = rcu_dereference_protected(*qpp,
534 			lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
535 			qpp = &q->next) {
536 			if (q == qp) {
537 				RCU_INIT_POINTER(*qpp,
538 				     rcu_dereference_protected(qp->next,
539 				     lockdep_is_held(&rdi->qp_dev->qpt_lock)));
540 				removed = 1;
541 				trace_rvt_qpremove(qp, n);
542 				break;
543 			}
544 		}
545 	}
546 
547 	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
548 	if (removed) {
549 		synchronize_rcu();
550 		rvt_put_qp(qp);
551 	}
552 }
553 
554 /**
555  * rvt_init_qp - initialize the QP state to the reset state
556  * @qp: the QP to init or reinit
557  * @type: the QP type
558  *
559  * This function is called from both rvt_create_qp() and
560  * rvt_reset_qp().   The difference is that the reset path
561  * takes the necessary locks to protect against concurrent
562  * access.
563  */
564 static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
565 			enum ib_qp_type type)
566 {
567 	qp->remote_qpn = 0;
568 	qp->qkey = 0;
569 	qp->qp_access_flags = 0;
570 	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
571 	qp->s_hdrwords = 0;
572 	qp->s_wqe = NULL;
573 	qp->s_draining = 0;
574 	qp->s_next_psn = 0;
575 	qp->s_last_psn = 0;
576 	qp->s_sending_psn = 0;
577 	qp->s_sending_hpsn = 0;
578 	qp->s_psn = 0;
579 	qp->r_psn = 0;
580 	qp->r_msn = 0;
581 	if (type == IB_QPT_RC) {
582 		qp->s_state = IB_OPCODE_RC_SEND_LAST;
583 		qp->r_state = IB_OPCODE_RC_SEND_LAST;
584 	} else {
585 		qp->s_state = IB_OPCODE_UC_SEND_LAST;
586 		qp->r_state = IB_OPCODE_UC_SEND_LAST;
587 	}
588 	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
589 	qp->r_nak_state = 0;
590 	qp->r_aflags = 0;
591 	qp->r_flags = 0;
592 	qp->s_head = 0;
593 	qp->s_tail = 0;
594 	qp->s_cur = 0;
595 	qp->s_acked = 0;
596 	qp->s_last = 0;
597 	qp->s_ssn = 1;
598 	qp->s_lsn = 0;
599 	qp->s_mig_state = IB_MIG_MIGRATED;
600 	qp->r_head_ack_queue = 0;
601 	qp->s_tail_ack_queue = 0;
602 	qp->s_num_rd_atomic = 0;
603 	if (qp->r_rq.wq) {
604 		qp->r_rq.wq->head = 0;
605 		qp->r_rq.wq->tail = 0;
606 	}
607 	qp->r_sge.num_sge = 0;
608 	atomic_set(&qp->s_reserved_used, 0);
609 }
610 
611 /**
612  * rvt_reset_qp - initialize the QP state to the reset state
613  * @qp: the QP to reset
614  * @type: the QP type
615  *
616  * r_lock, s_hlock, and s_lock are required to be held by the caller
617  */
618 static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
619 			 enum ib_qp_type type)
620 	__must_hold(&qp->s_lock)
621 	__must_hold(&qp->s_hlock)
622 	__must_hold(&qp->r_lock)
623 {
624 	lockdep_assert_held(&qp->r_lock);
625 	lockdep_assert_held(&qp->s_hlock);
626 	lockdep_assert_held(&qp->s_lock);
627 	if (qp->state != IB_QPS_RESET) {
628 		qp->state = IB_QPS_RESET;
629 
630 		/* Let drivers flush their waitlist */
631 		rdi->driver_f.flush_qp_waiters(qp);
632 		rvt_stop_rc_timers(qp);
633 		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
634 		spin_unlock(&qp->s_lock);
635 		spin_unlock(&qp->s_hlock);
636 		spin_unlock_irq(&qp->r_lock);
637 
638 		/* Stop the send queue and the retry timer */
639 		rdi->driver_f.stop_send_queue(qp);
640 		rvt_del_timers_sync(qp);
641 		/* Wait for things to stop */
642 		rdi->driver_f.quiesce_qp(qp);
643 
644 		/* take qp out the hash and wait for it to be unused */
645 		rvt_remove_qp(rdi, qp);
646 		wait_event(qp->wait, !atomic_read(&qp->refcount));
647 
648 		/* grab the lock b/c it was locked at call time */
649 		spin_lock_irq(&qp->r_lock);
650 		spin_lock(&qp->s_hlock);
651 		spin_lock(&qp->s_lock);
652 
653 		rvt_clear_mr_refs(qp, 1);
654 		/*
655 		 * Let the driver do any tear down or re-init it needs to for
656 		 * a qp that has been reset
657 		 */
658 		rdi->driver_f.notify_qp_reset(qp);
659 	}
660 	rvt_init_qp(rdi, qp, type);
661 	lockdep_assert_held(&qp->r_lock);
662 	lockdep_assert_held(&qp->s_hlock);
663 	lockdep_assert_held(&qp->s_lock);
664 }
665 
666 /**
667  * rvt_create_qp - create a queue pair for a device
668  * @ibpd: the protection domain whose device we create the queue pair for
669  * @init_attr: the attributes of the queue pair
670  * @udata: user data for libibverbs.so
671  *
672  * Queue pair creation is mostly an rvt issue. However, drivers have their own
673  * unique idea of what queue pair numbers mean. For instance there is a reserved
674  * range for PSM.
675  *
676  * Return: the queue pair on success, otherwise returns an errno.
677  *
678  * Called by the ib_create_qp() core verbs function.
679  */
680 struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
681 			    struct ib_qp_init_attr *init_attr,
682 			    struct ib_udata *udata)
683 {
684 	struct rvt_qp *qp;
685 	int err;
686 	struct rvt_swqe *swq = NULL;
687 	size_t sz;
688 	size_t sg_list_sz;
689 	struct ib_qp *ret = ERR_PTR(-ENOMEM);
690 	struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
691 	void *priv = NULL;
692 	gfp_t gfp;
693 	size_t sqsize;
694 
695 	if (!rdi)
696 		return ERR_PTR(-EINVAL);
697 
698 	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge ||
699 	    init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
700 	    init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
701 		return ERR_PTR(-EINVAL);
702 
703 	/* GFP_NOIO is applicable to RC QPs only */
704 
705 	if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
706 	    init_attr->qp_type != IB_QPT_RC)
707 		return ERR_PTR(-EINVAL);
708 
709 	gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
710 						GFP_NOIO : GFP_KERNEL;
711 
712 	/* Check receive queue parameters if no SRQ is specified. */
713 	if (!init_attr->srq) {
714 		if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge ||
715 		    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
716 			return ERR_PTR(-EINVAL);
717 
718 		if (init_attr->cap.max_send_sge +
719 		    init_attr->cap.max_send_wr +
720 		    init_attr->cap.max_recv_sge +
721 		    init_attr->cap.max_recv_wr == 0)
722 			return ERR_PTR(-EINVAL);
723 	}
724 	sqsize =
725 		init_attr->cap.max_send_wr + 1 +
726 		rdi->dparms.reserved_operations;
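	/*
	 * Illustrative: with max_send_wr = 128 and, say, 2 reserved
	 * operations, sqsize is 131; rvt_query_qp() later reports
	 * s_size - 1 - reserved_operations = 128 back to the caller.
	 */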
727 	switch (init_attr->qp_type) {
728 	case IB_QPT_SMI:
729 	case IB_QPT_GSI:
730 		if (init_attr->port_num == 0 ||
731 		    init_attr->port_num > ibpd->device->phys_port_cnt)
732 			return ERR_PTR(-EINVAL);
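		/* fall through - SMI/GSI share the allocation path below */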
733 	case IB_QPT_UC:
734 	case IB_QPT_RC:
735 	case IB_QPT_UD:
736 		sz = sizeof(struct rvt_sge) *
737 			init_attr->cap.max_send_sge +
738 			sizeof(struct rvt_swqe);
739 		if (gfp == GFP_NOIO)
740 			swq = __vmalloc(
741 				sqsize * sz,
742 				gfp | __GFP_ZERO, PAGE_KERNEL);
743 		else
744 			swq = vzalloc_node(
745 				sqsize * sz,
746 				rdi->dparms.node);
747 		if (!swq)
748 			return ERR_PTR(-ENOMEM);
749 
750 		sz = sizeof(*qp);
751 		sg_list_sz = 0;
752 		if (init_attr->srq) {
753 			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
754 
755 			if (srq->rq.max_sge > 1)
756 				sg_list_sz = sizeof(*qp->r_sg_list) *
757 					(srq->rq.max_sge - 1);
758 		} else if (init_attr->cap.max_recv_sge > 1)
759 			sg_list_sz = sizeof(*qp->r_sg_list) *
760 				(init_attr->cap.max_recv_sge - 1);
761 		qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node);
762 		if (!qp)
763 			goto bail_swq;
764 
765 		RCU_INIT_POINTER(qp->next, NULL);
766 		if (init_attr->qp_type == IB_QPT_RC) {
767 			qp->s_ack_queue =
768 				kzalloc_node(
769 					sizeof(*qp->s_ack_queue) *
770 					 rvt_max_atomic(rdi),
771 					gfp,
772 					rdi->dparms.node);
773 			if (!qp->s_ack_queue)
774 				goto bail_qp;
775 		}
776 		/* initialize timers needed for rc qp */
777 		setup_timer(&qp->s_timer, rvt_rc_timeout, (unsigned long)qp);
778 		hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
779 			     HRTIMER_MODE_REL);
780 		qp->s_rnr_timer.function = rvt_rc_rnr_retry;
781 
782 		/*
783 		 * Driver needs to set up its private QP structure and do any
784 		 * initialization that is needed.
785 		 */
786 		priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
787 		if (IS_ERR(priv)) {
788 			ret = priv;
789 			goto bail_qp;
790 		}
791 		qp->priv = priv;
792 		qp->timeout_jiffies =
793 			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
794 				1000UL);
795 		if (init_attr->srq) {
796 			sz = 0;
797 		} else {
798 			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
799 			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
800 			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
801 				sizeof(struct rvt_rwqe);
802 			if (udata)
803 				qp->r_rq.wq = vmalloc_user(
804 						sizeof(struct rvt_rwq) +
805 						qp->r_rq.size * sz);
806 			else if (gfp == GFP_NOIO)
807 				qp->r_rq.wq = __vmalloc(
808 						sizeof(struct rvt_rwq) +
809 						qp->r_rq.size * sz,
810 						gfp | __GFP_ZERO, PAGE_KERNEL);
811 			else
812 				qp->r_rq.wq = vzalloc_node(
813 						sizeof(struct rvt_rwq) +
814 						qp->r_rq.size * sz,
815 						rdi->dparms.node);
816 			if (!qp->r_rq.wq)
817 				goto bail_driver_priv;
818 		}
819 
820 		/*
821 		 * ib_create_qp() will initialize qp->ibqp
822 		 * except for qp->ibqp.qp_num.
823 		 */
824 		spin_lock_init(&qp->r_lock);
825 		spin_lock_init(&qp->s_hlock);
826 		spin_lock_init(&qp->s_lock);
827 		spin_lock_init(&qp->r_rq.lock);
828 		atomic_set(&qp->refcount, 0);
829 		atomic_set(&qp->local_ops_pending, 0);
830 		init_waitqueue_head(&qp->wait);
831 		init_timer(&qp->s_timer);
832 		qp->s_timer.data = (unsigned long)qp;
833 		INIT_LIST_HEAD(&qp->rspwait);
834 		qp->state = IB_QPS_RESET;
835 		qp->s_wq = swq;
836 		qp->s_size = sqsize;
837 		qp->s_avail = init_attr->cap.max_send_wr;
838 		qp->s_max_sge = init_attr->cap.max_send_sge;
839 		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
840 			qp->s_flags = RVT_S_SIGNAL_REQ_WR;
841 
842 		err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
843 				init_attr->qp_type,
844 				init_attr->port_num, gfp);
845 		if (err < 0) {
846 			ret = ERR_PTR(err);
847 			goto bail_rq_wq;
848 		}
849 		qp->ibqp.qp_num = err;
850 		qp->port_num = init_attr->port_num;
851 		rvt_init_qp(rdi, qp, init_attr->qp_type);
852 		break;
853 
854 	default:
855 		/* Don't support raw QPs */
856 		return ERR_PTR(-EINVAL);
857 	}
858 
859 	init_attr->cap.max_inline_data = 0;
860 
861 	/*
862 	 * Return the address of the RWQ as the offset to mmap.
863 	 * See rvt_mmap() for details.
864 	 */
865 	if (udata && udata->outlen >= sizeof(__u64)) {
866 		if (!qp->r_rq.wq) {
867 			__u64 offset = 0;
868 
869 			err = ib_copy_to_udata(udata, &offset,
870 					       sizeof(offset));
871 			if (err) {
872 				ret = ERR_PTR(err);
873 				goto bail_qpn;
874 			}
875 		} else {
876 			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
877 
878 			qp->ip = rvt_create_mmap_info(rdi, s,
879 						      ibpd->uobject->context,
880 						      qp->r_rq.wq);
881 			if (!qp->ip) {
882 				ret = ERR_PTR(-ENOMEM);
883 				goto bail_qpn;
884 			}
885 
886 			err = ib_copy_to_udata(udata, &qp->ip->offset,
887 					       sizeof(qp->ip->offset));
888 			if (err) {
889 				ret = ERR_PTR(err);
890 				goto bail_ip;
891 			}
892 		}
893 		qp->pid = current->pid;
894 	}
895 
896 	spin_lock(&rdi->n_qps_lock);
897 	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
898 		spin_unlock(&rdi->n_qps_lock);
899 		ret = ERR_PTR(-ENOMEM);
900 		goto bail_ip;
901 	}
902 
903 	rdi->n_qps_allocated++;
904 	/*
905 	 * Maintain a busy_jiffies variable that will be added to the timeout
906 	 * period in mod_retry_timer and add_retry_timer. This busy jiffies
907 	 * is scaled by the number of rc qps created for the device to reduce
908 	 * the number of timeouts occurring when there is a large number of
909 	 * qps. busy_jiffies is incremented every rc qp scaling interval.
910 	 * The scaling interval is selected based on extensive performance
911 	 * evaluation of targeted workloads.
912 	 */
913 	if (init_attr->qp_type == IB_QPT_RC) {
914 		rdi->n_rc_qps++;
915 		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
916 	}
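	/*
	 * Illustrative: if RC_QP_SCALING_INTERVAL were 16 (the actual value
	 * comes from rdma_vt.h), 64 active RC QPs would add 4 jiffies of
	 * busy_jiffies to each retry timeout the timer helpers compute.
	 */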
917 	spin_unlock(&rdi->n_qps_lock);
918 
919 	if (qp->ip) {
920 		spin_lock_irq(&rdi->pending_lock);
921 		list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
922 		spin_unlock_irq(&rdi->pending_lock);
923 	}
924 
925 	ret = &qp->ibqp;
926 
927 	/*
928 	 * We have our QP and it's good; now keep track of what types of opcodes
929 	 * can be processed on this QP. We do this by keeping track of what the
930 	 * 3 high order bits of the opcode are.
931 	 */
932 	switch (init_attr->qp_type) {
933 	case IB_QPT_SMI:
934 	case IB_QPT_GSI:
935 	case IB_QPT_UD:
936 		qp->allowed_ops = IB_OPCODE_UD;
937 		break;
938 	case IB_QPT_RC:
939 		qp->allowed_ops = IB_OPCODE_RC;
940 		break;
941 	case IB_QPT_UC:
942 		qp->allowed_ops = IB_OPCODE_UC;
943 		break;
944 	default:
945 		ret = ERR_PTR(-EINVAL);
946 		goto bail_ip;
947 	}
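	/*
	 * Illustrative: allowed_ops holds the three high-order opcode bits
	 * (IB_OPCODE_RC is 0x00, IB_OPCODE_UC 0x20, IB_OPCODE_UD 0x60), so
	 * the receive path can reject any packet whose (opcode & 0xe0) does
	 * not match the QP type.
	 */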
948 
949 	return ret;
950 
951 bail_ip:
952 	if (qp->ip)
953 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
954 
955 bail_qpn:
956 	free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
957 
958 bail_rq_wq:
959 	if (!qp->ip)
960 		vfree(qp->r_rq.wq);
961 
962 bail_driver_priv:
963 	rdi->driver_f.qp_priv_free(rdi, qp);
964 
965 bail_qp:
966 	kfree(qp->s_ack_queue);
967 	kfree(qp);
968 
969 bail_swq:
970 	vfree(swq);
971 
972 	return ret;
973 }
974 
975 /**
976  * rvt_error_qp - put a QP into the error state
977  * @qp: the QP to put into the error state
978  * @err: the receive completion error to signal if a RWQE is active
979  *
980  * Flushes both send and receive work queues.
981  *
982  * Return: true if last WQE event should be generated.
983  * The QP r_lock and s_lock should be held and interrupts disabled.
984  * If we are already in error state, just return.
985  */
986 int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
987 {
988 	struct ib_wc wc;
989 	int ret = 0;
990 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
991 
992 	lockdep_assert_held(&qp->r_lock);
993 	lockdep_assert_held(&qp->s_lock);
994 	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
995 		goto bail;
996 
997 	qp->state = IB_QPS_ERR;
998 
999 	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
1000 		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
1001 		del_timer(&qp->s_timer);
1002 	}
1003 
1004 	if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
1005 		qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
1006 
1007 	rdi->driver_f.notify_error_qp(qp);
1008 
1009 	/* Schedule the sending tasklet to drain the send work queue. */
1010 	if (ACCESS_ONCE(qp->s_last) != qp->s_head)
1011 		rdi->driver_f.schedule_send(qp);
1012 
1013 	rvt_clear_mr_refs(qp, 0);
1014 
1015 	memset(&wc, 0, sizeof(wc));
1016 	wc.qp = &qp->ibqp;
1017 	wc.opcode = IB_WC_RECV;
1018 
1019 	if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
1020 		wc.wr_id = qp->r_wr_id;
1021 		wc.status = err;
1022 		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1023 	}
1024 	wc.status = IB_WC_WR_FLUSH_ERR;
1025 
1026 	if (qp->r_rq.wq) {
1027 		struct rvt_rwq *wq;
1028 		u32 head;
1029 		u32 tail;
1030 
1031 		spin_lock(&qp->r_rq.lock);
1032 
1033 		/* sanity check pointers before trusting them */
1034 		wq = qp->r_rq.wq;
1035 		head = wq->head;
1036 		if (head >= qp->r_rq.size)
1037 			head = 0;
1038 		tail = wq->tail;
1039 		if (tail >= qp->r_rq.size)
1040 			tail = 0;
1041 		while (tail != head) {
1042 			wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
1043 			if (++tail >= qp->r_rq.size)
1044 				tail = 0;
1045 			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1046 		}
1047 		wq->tail = tail;
1048 
1049 		spin_unlock(&qp->r_rq.lock);
1050 	} else if (qp->ibqp.event_handler) {
1051 		ret = 1;
1052 	}
1053 
1054 bail:
1055 	return ret;
1056 }
1057 EXPORT_SYMBOL(rvt_error_qp);
1058 
1059 /*
1060  * Put the QP into the hash table.
1061  * The hash table holds a reference to the QP.
1062  */
1063 static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
1064 {
1065 	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
1066 	unsigned long flags;
1067 
1068 	rvt_get_qp(qp);
1069 	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
1070 
1071 	if (qp->ibqp.qp_num <= 1) {
1072 		rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
1073 	} else {
1074 		u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
1075 
1076 		qp->next = rdi->qp_dev->qp_table[n];
1077 		rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
1078 		trace_rvt_qpinsert(qp, n);
1079 	}
1080 
1081 	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
1082 }
1083 
1084 /**
1085  * rvt_modify_qp - modify the attributes of a queue pair
1086  * @ibqp: the queue pair whose attributes we're modifying
1087  * @attr: the new attributes
1088  * @attr_mask: the mask of attributes to modify
1089  * @udata: user data for libibverbs.so
1090  *
1091  * Return: 0 on success, otherwise returns an errno.
1092  */
1093 int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1094 		  int attr_mask, struct ib_udata *udata)
1095 {
1096 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1097 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1098 	enum ib_qp_state cur_state, new_state;
1099 	struct ib_event ev;
1100 	int lastwqe = 0;
1101 	int mig = 0;
1102 	int pmtu = 0; /* for gcc warning only */
1103 	enum rdma_link_layer link;
1104 
1105 	link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
1106 
1107 	spin_lock_irq(&qp->r_lock);
1108 	spin_lock(&qp->s_hlock);
1109 	spin_lock(&qp->s_lock);
1110 
1111 	cur_state = attr_mask & IB_QP_CUR_STATE ?
1112 		attr->cur_qp_state : qp->state;
1113 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
1114 
1115 	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1116 				attr_mask, link))
1117 		goto inval;
1118 
1119 	if (rdi->driver_f.check_modify_qp &&
1120 	    rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
1121 		goto inval;
1122 
1123 	if (attr_mask & IB_QP_AV) {
1124 		if (attr->ah_attr.dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
1125 			goto inval;
1126 		if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
1127 			goto inval;
1128 	}
1129 
1130 	if (attr_mask & IB_QP_ALT_PATH) {
1131 		if (attr->alt_ah_attr.dlid >=
1132 		    be16_to_cpu(IB_MULTICAST_LID_BASE))
1133 			goto inval;
1134 		if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
1135 			goto inval;
1136 		if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
1137 			goto inval;
1138 	}
1139 
1140 	if (attr_mask & IB_QP_PKEY_INDEX)
1141 		if (attr->pkey_index >= rvt_get_npkeys(rdi))
1142 			goto inval;
1143 
1144 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
1145 		if (attr->min_rnr_timer > 31)
1146 			goto inval;
1147 
1148 	if (attr_mask & IB_QP_PORT)
1149 		if (qp->ibqp.qp_type == IB_QPT_SMI ||
1150 		    qp->ibqp.qp_type == IB_QPT_GSI ||
1151 		    attr->port_num == 0 ||
1152 		    attr->port_num > ibqp->device->phys_port_cnt)
1153 			goto inval;
1154 
1155 	if (attr_mask & IB_QP_DEST_QPN)
1156 		if (attr->dest_qp_num > RVT_QPN_MASK)
1157 			goto inval;
1158 
1159 	if (attr_mask & IB_QP_RETRY_CNT)
1160 		if (attr->retry_cnt > 7)
1161 			goto inval;
1162 
1163 	if (attr_mask & IB_QP_RNR_RETRY)
1164 		if (attr->rnr_retry > 7)
1165 			goto inval;
1166 
1167 	/*
1168 	 * Don't allow invalid path_mtu values.  OK to set greater
1169 	 * than the active mtu (or even the max_cap, if we have tuned
1170 	 * that to a small mtu).  We'll set qp->path_mtu
1171 	 * to the lesser of the requested attribute mtu and the active mtu,
1172 	 * for packetizing messages.
1173 	 * Note that the QP port has to be set in INIT and MTU in RTR.
1174 	 */
1175 	if (attr_mask & IB_QP_PATH_MTU) {
1176 		pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
1177 		if (pmtu < 0)
1178 			goto inval;
1179 	}
1180 
1181 	if (attr_mask & IB_QP_PATH_MIG_STATE) {
1182 		if (attr->path_mig_state == IB_MIG_REARM) {
1183 			if (qp->s_mig_state == IB_MIG_ARMED)
1184 				goto inval;
1185 			if (new_state != IB_QPS_RTS)
1186 				goto inval;
1187 		} else if (attr->path_mig_state == IB_MIG_MIGRATED) {
1188 			if (qp->s_mig_state == IB_MIG_REARM)
1189 				goto inval;
1190 			if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
1191 				goto inval;
1192 			if (qp->s_mig_state == IB_MIG_ARMED)
1193 				mig = 1;
1194 		} else {
1195 			goto inval;
1196 		}
1197 	}
1198 
1199 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1200 		if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
1201 			goto inval;
1202 
1203 	switch (new_state) {
1204 	case IB_QPS_RESET:
1205 		if (qp->state != IB_QPS_RESET)
1206 			rvt_reset_qp(rdi, qp, ibqp->qp_type);
1207 		break;
1208 
1209 	case IB_QPS_RTR:
1210 		/* Allow event to re-trigger if QP set to RTR more than once */
1211 		qp->r_flags &= ~RVT_R_COMM_EST;
1212 		qp->state = new_state;
1213 		break;
1214 
1215 	case IB_QPS_SQD:
1216 		qp->s_draining = qp->s_last != qp->s_cur;
1217 		qp->state = new_state;
1218 		break;
1219 
1220 	case IB_QPS_SQE:
1221 		if (qp->ibqp.qp_type == IB_QPT_RC)
1222 			goto inval;
1223 		qp->state = new_state;
1224 		break;
1225 
1226 	case IB_QPS_ERR:
1227 		lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1228 		break;
1229 
1230 	default:
1231 		qp->state = new_state;
1232 		break;
1233 	}
1234 
1235 	if (attr_mask & IB_QP_PKEY_INDEX)
1236 		qp->s_pkey_index = attr->pkey_index;
1237 
1238 	if (attr_mask & IB_QP_PORT)
1239 		qp->port_num = attr->port_num;
1240 
1241 	if (attr_mask & IB_QP_DEST_QPN)
1242 		qp->remote_qpn = attr->dest_qp_num;
1243 
1244 	if (attr_mask & IB_QP_SQ_PSN) {
1245 		qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
1246 		qp->s_psn = qp->s_next_psn;
1247 		qp->s_sending_psn = qp->s_next_psn;
1248 		qp->s_last_psn = qp->s_next_psn - 1;
1249 		qp->s_sending_hpsn = qp->s_last_psn;
1250 	}
1251 
1252 	if (attr_mask & IB_QP_RQ_PSN)
1253 		qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
1254 
1255 	if (attr_mask & IB_QP_ACCESS_FLAGS)
1256 		qp->qp_access_flags = attr->qp_access_flags;
1257 
1258 	if (attr_mask & IB_QP_AV) {
1259 		qp->remote_ah_attr = attr->ah_attr;
1260 		qp->s_srate = attr->ah_attr.static_rate;
1261 		qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
1262 	}
1263 
1264 	if (attr_mask & IB_QP_ALT_PATH) {
1265 		qp->alt_ah_attr = attr->alt_ah_attr;
1266 		qp->s_alt_pkey_index = attr->alt_pkey_index;
1267 	}
1268 
1269 	if (attr_mask & IB_QP_PATH_MIG_STATE) {
1270 		qp->s_mig_state = attr->path_mig_state;
1271 		if (mig) {
1272 			qp->remote_ah_attr = qp->alt_ah_attr;
1273 			qp->port_num = qp->alt_ah_attr.port_num;
1274 			qp->s_pkey_index = qp->s_alt_pkey_index;
1275 		}
1276 	}
1277 
1278 	if (attr_mask & IB_QP_PATH_MTU) {
1279 		qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
1280 		qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
1281 		qp->log_pmtu = ilog2(qp->pmtu);
1282 	}
1283 
1284 	if (attr_mask & IB_QP_RETRY_CNT) {
1285 		qp->s_retry_cnt = attr->retry_cnt;
1286 		qp->s_retry = attr->retry_cnt;
1287 	}
1288 
1289 	if (attr_mask & IB_QP_RNR_RETRY) {
1290 		qp->s_rnr_retry_cnt = attr->rnr_retry;
1291 		qp->s_rnr_retry = attr->rnr_retry;
1292 	}
1293 
1294 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
1295 		qp->r_min_rnr_timer = attr->min_rnr_timer;
1296 
1297 	if (attr_mask & IB_QP_TIMEOUT) {
1298 		qp->timeout = attr->timeout;
1299 		qp->timeout_jiffies =
1300 			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
1301 				1000UL);
1302 	}
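	/*
	 * Illustrative: the IBTA local ACK timeout is 4.096 usec * 2^timeout,
	 * so attr->timeout = 14 gives 4096 * 2^14 / 1000 ~= 67108 usec,
	 * roughly 67 msec once converted to timeout_jiffies above.
	 */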
1303 
1304 	if (attr_mask & IB_QP_QKEY)
1305 		qp->qkey = attr->qkey;
1306 
1307 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1308 		qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
1309 
1310 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
1311 		qp->s_max_rd_atomic = attr->max_rd_atomic;
1312 
1313 	if (rdi->driver_f.modify_qp)
1314 		rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
1315 
1316 	spin_unlock(&qp->s_lock);
1317 	spin_unlock(&qp->s_hlock);
1318 	spin_unlock_irq(&qp->r_lock);
1319 
1320 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1321 		rvt_insert_qp(rdi, qp);
1322 
1323 	if (lastwqe) {
1324 		ev.device = qp->ibqp.device;
1325 		ev.element.qp = &qp->ibqp;
1326 		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1327 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1328 	}
1329 	if (mig) {
1330 		ev.device = qp->ibqp.device;
1331 		ev.element.qp = &qp->ibqp;
1332 		ev.event = IB_EVENT_PATH_MIG;
1333 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1334 	}
1335 	return 0;
1336 
1337 inval:
1338 	spin_unlock(&qp->s_lock);
1339 	spin_unlock(&qp->s_hlock);
1340 	spin_unlock_irq(&qp->r_lock);
1341 	return -EINVAL;
1342 }
1343 
1344 /**
 * rvt_free_qpn - Free a qpn from the bit map
1345  * @qpt: QP table
1346  * @qpn: queue pair number to free
1347  */
1348 static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
1349 {
1350 	struct rvt_qpn_map *map;
1351 
1352 	map = qpt->map + qpn / RVT_BITS_PER_PAGE;
1353 	if (map->page)
1354 		clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
1355 }
1356 
1357 /**
1358  * rvt_destroy_qp - destroy a queue pair
1359  * @ibqp: the queue pair to destroy
1360  *
1361  * Note that this can be called while the QP is actively sending or
1362  * receiving!
1363  *
1364  * Return: 0 on success.
1365  */
1366 int rvt_destroy_qp(struct ib_qp *ibqp)
1367 {
1368 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1369 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1370 
1371 	spin_lock_irq(&qp->r_lock);
1372 	spin_lock(&qp->s_hlock);
1373 	spin_lock(&qp->s_lock);
1374 	rvt_reset_qp(rdi, qp, ibqp->qp_type);
1375 	spin_unlock(&qp->s_lock);
1376 	spin_unlock(&qp->s_hlock);
1377 	spin_unlock_irq(&qp->r_lock);
1378 
1379 	/* qpn is now available for use again */
1380 	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
1381 
1382 	spin_lock(&rdi->n_qps_lock);
1383 	rdi->n_qps_allocated--;
1384 	if (qp->ibqp.qp_type == IB_QPT_RC) {
1385 		rdi->n_rc_qps--;
1386 		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
1387 	}
1388 	spin_unlock(&rdi->n_qps_lock);
1389 
1390 	if (qp->ip)
1391 		kref_put(&qp->ip->ref, rvt_release_mmap_info);
1392 	else
1393 		vfree(qp->r_rq.wq);
1394 	vfree(qp->s_wq);
1395 	rdi->driver_f.qp_priv_free(rdi, qp);
1396 	kfree(qp->s_ack_queue);
1397 	kfree(qp);
1398 	return 0;
1399 }
1400 
1401 /**
1402  * rvt_query_qp - query an ibqp
1403  * @ibqp: IB qp to query
1404  * @attr: attr struct to fill in
1405  * @attr_mask: attr mask ignored
1406  * @init_attr: struct to fill in
1407  *
1408  * Return: always 0
1409  */
1410 int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1411 		 int attr_mask, struct ib_qp_init_attr *init_attr)
1412 {
1413 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1414 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1415 
1416 	attr->qp_state = qp->state;
1417 	attr->cur_qp_state = attr->qp_state;
1418 	attr->path_mtu = qp->path_mtu;
1419 	attr->path_mig_state = qp->s_mig_state;
1420 	attr->qkey = qp->qkey;
1421 	attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
1422 	attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
1423 	attr->dest_qp_num = qp->remote_qpn;
1424 	attr->qp_access_flags = qp->qp_access_flags;
1425 	attr->cap.max_send_wr = qp->s_size - 1 -
1426 		rdi->dparms.reserved_operations;
1427 	attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
1428 	attr->cap.max_send_sge = qp->s_max_sge;
1429 	attr->cap.max_recv_sge = qp->r_rq.max_sge;
1430 	attr->cap.max_inline_data = 0;
1431 	attr->ah_attr = qp->remote_ah_attr;
1432 	attr->alt_ah_attr = qp->alt_ah_attr;
1433 	attr->pkey_index = qp->s_pkey_index;
1434 	attr->alt_pkey_index = qp->s_alt_pkey_index;
1435 	attr->en_sqd_async_notify = 0;
1436 	attr->sq_draining = qp->s_draining;
1437 	attr->max_rd_atomic = qp->s_max_rd_atomic;
1438 	attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
1439 	attr->min_rnr_timer = qp->r_min_rnr_timer;
1440 	attr->port_num = qp->port_num;
1441 	attr->timeout = qp->timeout;
1442 	attr->retry_cnt = qp->s_retry_cnt;
1443 	attr->rnr_retry = qp->s_rnr_retry_cnt;
1444 	attr->alt_port_num = qp->alt_ah_attr.port_num;
1445 	attr->alt_timeout = qp->alt_timeout;
1446 
1447 	init_attr->event_handler = qp->ibqp.event_handler;
1448 	init_attr->qp_context = qp->ibqp.qp_context;
1449 	init_attr->send_cq = qp->ibqp.send_cq;
1450 	init_attr->recv_cq = qp->ibqp.recv_cq;
1451 	init_attr->srq = qp->ibqp.srq;
1452 	init_attr->cap = attr->cap;
1453 	if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
1454 		init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
1455 	else
1456 		init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1457 	init_attr->qp_type = qp->ibqp.qp_type;
1458 	init_attr->port_num = qp->port_num;
1459 	return 0;
1460 }
1461 
1462 /**
1463  * rvt_post_recv - post a receive on a QP
1464  * @ibqp: the QP to post the receive on
1465  * @wr: the WR to post
1466  * @bad_wr: the first bad WR is put here
1467  *
1468  * This may be called from interrupt context.
1469  *
1470  * Return: 0 on success otherwise errno
1471  */
1472 int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
1473 		  struct ib_recv_wr **bad_wr)
1474 {
1475 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1476 	struct rvt_rwq *wq = qp->r_rq.wq;
1477 	unsigned long flags;
1478 	int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
1479 				!qp->ibqp.srq;
1480 
1481 	/* Check that state is OK to post receive. */
1482 	if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
1483 		*bad_wr = wr;
1484 		return -EINVAL;
1485 	}
1486 
1487 	for (; wr; wr = wr->next) {
1488 		struct rvt_rwqe *wqe;
1489 		u32 next;
1490 		int i;
1491 
1492 		if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
1493 			*bad_wr = wr;
1494 			return -EINVAL;
1495 		}
1496 
1497 		spin_lock_irqsave(&qp->r_rq.lock, flags);
1498 		next = wq->head + 1;
1499 		if (next >= qp->r_rq.size)
1500 			next = 0;
1501 		if (next == wq->tail) {
1502 			spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1503 			*bad_wr = wr;
1504 			return -ENOMEM;
1505 		}
1506 		if (unlikely(qp_err_flush)) {
1507 			struct ib_wc wc;
1508 
1509 			memset(&wc, 0, sizeof(wc));
1510 			wc.qp = &qp->ibqp;
1511 			wc.opcode = IB_WC_RECV;
1512 			wc.wr_id = wr->wr_id;
1513 			wc.status = IB_WC_WR_FLUSH_ERR;
1514 			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1515 		} else {
1516 			wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
1517 			wqe->wr_id = wr->wr_id;
1518 			wqe->num_sge = wr->num_sge;
1519 			for (i = 0; i < wr->num_sge; i++)
1520 				wqe->sg_list[i] = wr->sg_list[i];
1521 			/*
1522 			 * Make sure queue entry is written
1523 			 * before the head index.
1524 			 */
1525 			smp_wmb();
1526 			wq->head = next;
1527 		}
1528 		spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1529 	}
1530 	return 0;
1531 }
1532 
1533 /**
1534  * rvt_qp_valid_operation - validate post send wr request
1535  * @qp - the qp
1536  * @post_parms - the post send table for the driver
1537  * @wr - the work request
1538  *
1539  * The routine validates the operation based on the
1540  * validation table and returns the length of the operation
1541  * which can extend beyond the ib_send_wr.  Operation-
1542  * dependent flags key the atomic operation validation.
1543  *
1544  * There is an exception for UD qps that validates the pd and
1545  * overrides the length to include the additional UD specific
1546  * length.
1547  *
1548  * Returns a negative error or the length of the work request
1549  * for building the swqe.
1550  */
1551 static inline int rvt_qp_valid_operation(
1552 	struct rvt_qp *qp,
1553 	const struct rvt_operation_params *post_parms,
1554 	struct ib_send_wr *wr)
1555 {
1556 	int len;
1557 
1558 	if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
1559 		return -EINVAL;
1560 	if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
1561 		return -EINVAL;
1562 	if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
1563 	    ibpd_to_rvtpd(qp->ibqp.pd)->user)
1564 		return -EINVAL;
1565 	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
1566 	    (wr->num_sge == 0 ||
1567 	     wr->sg_list[0].length < sizeof(u64) ||
1568 	     wr->sg_list[0].addr & (sizeof(u64) - 1)))
1569 		return -EINVAL;
1570 	if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
1571 	    !qp->s_max_rd_atomic)
1572 		return -EINVAL;
1573 	len = post_parms[wr->opcode].length;
1574 	/* UD specific */
1575 	if (qp->ibqp.qp_type != IB_QPT_UC &&
1576 	    qp->ibqp.qp_type != IB_QPT_RC) {
1577 		if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
1578 			return -EINVAL;
1579 		len = sizeof(struct ib_ud_wr);
1580 	}
1581 	return len;
1582 }
1583 
1584 /**
1585  * rvt_qp_is_avail - determine queue capacity
1586  * @qp - the qp
1587  * @rdi - the rdmavt device
1588  * @reserved_op - is reserved operation
1589  *
1590  * This assumes the s_hlock is held but the s_last
1591  * qp variable is uncontrolled.
1592  *
1593  * For non reserved operations, the qp->s_avail
1594  * may be changed.
1595  *
1596  * The return value is zero or a -ENOMEM.
1597  */
1598 static inline int rvt_qp_is_avail(
1599 	struct rvt_qp *qp,
1600 	struct rvt_dev_info *rdi,
1601 	bool reserved_op)
1602 {
1603 	u32 slast;
1604 	u32 avail;
1605 	u32 reserved_used;
1606 
1607 	/* see rvt_qp_wqe_unreserve() */
1608 	smp_mb__before_atomic();
1609 	reserved_used = atomic_read(&qp->s_reserved_used);
1610 	if (unlikely(reserved_op)) {
1611 		/* see rvt_qp_wqe_unreserve() */
1612 		smp_mb__before_atomic();
1613 		if (reserved_used >= rdi->dparms.reserved_operations)
1614 			return -ENOMEM;
1615 		return 0;
1616 	}
1617 	/* non-reserved operations */
1618 	if (likely(qp->s_avail))
1619 		return 0;
1620 	smp_read_barrier_depends(); /* see rc.c */
1621 	slast = ACCESS_ONCE(qp->s_last);
1622 	if (qp->s_head >= slast)
1623 		avail = qp->s_size - (qp->s_head - slast);
1624 	else
1625 		avail = slast - qp->s_head;
1626 
1627 	/* see rvt_qp_wqe_unreserve() */
1628 	smp_mb__before_atomic();
1629 	reserved_used = atomic_read(&qp->s_reserved_used);
1630 	avail =  avail - 1 -
1631 		(rdi->dparms.reserved_operations - reserved_used);
1632 	/* ensure we don't assign a negative s_avail */
1633 	if ((s32)avail <= 0)
1634 		return -ENOMEM;
1635 	qp->s_avail = avail;
1636 	if (WARN_ON(qp->s_avail >
1637 		    (qp->s_size - 1 - rdi->dparms.reserved_operations)))
1638 		rvt_pr_err(rdi,
1639 			   "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
1640 			   qp->ibqp.qp_num, qp->s_size, qp->s_avail,
1641 			   qp->s_head, qp->s_tail, qp->s_cur,
1642 			   qp->s_acked, qp->s_last);
1643 	return 0;
1644 }
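/*
 * Worked example (illustrative): with s_size = 16, s_head = 3, s_last = 10
 * and 2 reserved operations none of which are in use, the ring holds
 * slast - s_head = 7 free slots and s_avail becomes 7 - 1 - (2 - 0) = 4.
 */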
1645 
1646 /**
1647  * rvt_post_one_wr - post one RC, UC, or UD send work request
1648  * @qp: the QP to post on
1649  * @wr: the work request to send
1650  */
1651 static int rvt_post_one_wr(struct rvt_qp *qp,
1652 			   struct ib_send_wr *wr,
1653 			   int *call_send)
1654 {
1655 	struct rvt_swqe *wqe;
1656 	u32 next;
1657 	int i;
1658 	int j;
1659 	int acc;
1660 	struct rvt_lkey_table *rkt;
1661 	struct rvt_pd *pd;
1662 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
1663 	u8 log_pmtu;
1664 	int ret;
1665 	size_t cplen;
1666 	bool reserved_op;
1667 	int local_ops_delayed = 0;
1668 
1669 	BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
1670 
1671 	/* IB spec says that num_sge == 0 is OK. */
1672 	if (unlikely(wr->num_sge > qp->s_max_sge))
1673 		return -EINVAL;
1674 
1675 	ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
1676 	if (ret < 0)
1677 		return ret;
1678 	cplen = ret;
1679 
1680 	/*
1681 	 * Local operations include fast register and local invalidate.
1682 	 * Fast register needs to be processed immediately because the
1683 	 * registered lkey may be used by following work requests and the
1684 	 * lkey needs to be valid at the time those requests are posted.
1685 	 * Local invalidate can be processed immediately if fencing is
1686 	 * not required and no previous local invalidate ops are pending.
1687 	 * Signaled local operations that have been processed immediately
1688 	 * need to have requests with "completion only" flags set posted
1689 	 * to the send queue in order to generate completions.
1690 	 */
1691 	if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
1692 		switch (wr->opcode) {
1693 		case IB_WR_REG_MR:
1694 			ret = rvt_fast_reg_mr(qp,
1695 					      reg_wr(wr)->mr,
1696 					      reg_wr(wr)->key,
1697 					      reg_wr(wr)->access);
1698 			if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
1699 				return ret;
1700 			break;
1701 		case IB_WR_LOCAL_INV:
1702 			if ((wr->send_flags & IB_SEND_FENCE) ||
1703 			    atomic_read(&qp->local_ops_pending)) {
1704 				local_ops_delayed = 1;
1705 			} else {
1706 				ret = rvt_invalidate_rkey(
1707 					qp, wr->ex.invalidate_rkey);
1708 				if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
1709 					return ret;
1710 			}
1711 			break;
1712 		default:
1713 			return -EINVAL;
1714 		}
1715 	}
1716 
1717 	reserved_op = rdi->post_parms[wr->opcode].flags &
1718 			RVT_OPERATION_USE_RESERVE;
1719 	/* check for avail */
1720 	ret = rvt_qp_is_avail(qp, rdi, reserved_op);
1721 	if (ret)
1722 		return ret;
1723 	next = qp->s_head + 1;
1724 	if (next >= qp->s_size)
1725 		next = 0;
1726 
1727 	rkt = &rdi->lkey_table;
1728 	pd = ibpd_to_rvtpd(qp->ibqp.pd);
1729 	wqe = rvt_get_swqe_ptr(qp, qp->s_head);
1730 
1731 	/* cplen has length from above */
1732 	memcpy(&wqe->wr, wr, cplen);
1733 
1734 	wqe->length = 0;
1735 	j = 0;
1736 	if (wr->num_sge) {
1737 		acc = wr->opcode >= IB_WR_RDMA_READ ?
1738 			IB_ACCESS_LOCAL_WRITE : 0;
1739 		for (i = 0; i < wr->num_sge; i++) {
1740 			u32 length = wr->sg_list[i].length;
1741 			int ok;
1742 
1743 			if (length == 0)
1744 				continue;
1745 			ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
1746 					 &wr->sg_list[i], acc);
1747 			if (!ok) {
1748 				ret = -EINVAL;
1749 				goto bail_inval_free;
1750 			}
1751 			wqe->length += length;
1752 			j++;
1753 		}
1754 		wqe->wr.num_sge = j;
1755 	}
1756 
1757 	/* general part of wqe valid - allow for driver checks */
1758 	if (rdi->driver_f.check_send_wqe) {
1759 		ret = rdi->driver_f.check_send_wqe(qp, wqe);
1760 		if (ret < 0)
1761 			goto bail_inval_free;
1762 		if (ret)
1763 			*call_send = ret;
1764 	}
1765 
1766 	log_pmtu = qp->log_pmtu;
1767 	if (qp->ibqp.qp_type != IB_QPT_UC &&
1768 	    qp->ibqp.qp_type != IB_QPT_RC) {
1769 		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
1770 
1771 		log_pmtu = ah->log_pmtu;
1772 		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
1773 	}
1774 
1775 	if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
1776 		if (local_ops_delayed)
1777 			atomic_inc(&qp->local_ops_pending);
1778 		else
1779 			wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
1780 		wqe->ssn = 0;
1781 		wqe->psn = 0;
1782 		wqe->lpsn = 0;
1783 	} else {
1784 		wqe->ssn = qp->s_ssn++;
1785 		wqe->psn = qp->s_next_psn;
1786 		wqe->lpsn = wqe->psn +
1787 				(wqe->length ?
1788 					((wqe->length - 1) >> log_pmtu) :
1789 					0);
1790 		qp->s_next_psn = wqe->lpsn + 1;
1791 	}
1792 	trace_rvt_post_one_wr(qp, wqe);
1793 	if (unlikely(reserved_op))
1794 		rvt_qp_wqe_reserve(qp, wqe);
1795 	else
1796 		qp->s_avail--;
1797 	smp_wmb(); /* see request builders */
1798 	qp->s_head = next;
1799 
1800 	return 0;
1801 
1802 bail_inval_free:
1803 	/* release mr holds */
1804 	while (j) {
1805 		struct rvt_sge *sge = &wqe->sg_list[--j];
1806 
1807 		rvt_put_mr(sge->mr);
1808 	}
1809 	return ret;
1810 }
1811 
1812 /**
1813  * rvt_post_send - post a send on a QP
1814  * @ibqp: the QP to post the send on
1815  * @wr: the list of work requests to post
1816  * @bad_wr: the first bad WR is put here
1817  *
1818  * This may be called from interrupt context.
1819  *
1820  * Return: 0 on success else errno
1821  */
1822 int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1823 		  struct ib_send_wr **bad_wr)
1824 {
1825 	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1826 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1827 	unsigned long flags = 0;
1828 	int call_send;
1829 	unsigned nreq = 0;
1830 	int err = 0;
1831 
1832 	spin_lock_irqsave(&qp->s_hlock, flags);
1833 
1834 	/*
1835 	 * Ensure QP state is such that we can send. If not, bail out early;
1836 	 * there is no need to do this every time we post a send.
1837 	 */
1838 	if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
1839 		spin_unlock_irqrestore(&qp->s_hlock, flags);
1840 		return -EINVAL;
1841 	}
1842 
1843 	/*
1844 	 * If the send queue is empty and we only have a single WR, then just go
1845 	 * ahead and kick the send engine into gear. Otherwise we will always
1846 	 * just schedule the send to happen later.
1847 	 */
1848 	call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;
1849 
1850 	for (; wr; wr = wr->next) {
1851 		err = rvt_post_one_wr(qp, wr, &call_send);
1852 		if (unlikely(err)) {
1853 			*bad_wr = wr;
1854 			goto bail;
1855 		}
1856 		nreq++;
1857 	}
1858 bail:
1859 	spin_unlock_irqrestore(&qp->s_hlock, flags);
1860 	if (nreq) {
1861 		if (call_send)
1862 			rdi->driver_f.do_send(qp);
1863 		else
1864 			rdi->driver_f.schedule_send_no_lock(qp);
1865 	}
1866 	return err;
1867 }
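
/*
 * Illustrative caller sketch, an assumption rather than code from this file:
 * a consumer posting a single signaled send through the verbs API, which the
 * core dispatches to rvt_post_send() for rdmavt-backed devices.  The helper
 * name and the wr_id value are hypothetical.
 */
static int example_post_one_send(struct ib_qp *qp, struct ib_sge *sge)
{
	struct ib_send_wr wr = { }, *bad_wr;

	wr.wr_id = 1;			/* caller-chosen completion cookie */
	wr.opcode = IB_WR_SEND;
	wr.sg_list = sge;
	wr.num_sge = 1;
	wr.send_flags = IB_SEND_SIGNALED;

	/* 0 on success; on failure *bad_wr points at the offending WR */
	return ib_post_send(qp, &wr, &bad_wr);
}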
1868 
1869 /**
1870  * rvt_post_srq_recv - post a receive on a shared receive queue
1871  * @ibsrq: the SRQ to post the receive on
1872  * @wr: the list of work requests to post
1873  * @bad_wr: the first WR to cause a problem is put here
1874  *
1875  * This may be called from interrupt context.
1876  *
1877  * Return: 0 on success else errno
1878  */
1879 int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
1880 		      struct ib_recv_wr **bad_wr)
1881 {
1882 	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
1883 	struct rvt_rwq *wq;
1884 	unsigned long flags;
1885 
1886 	for (; wr; wr = wr->next) {
1887 		struct rvt_rwqe *wqe;
1888 		u32 next;
1889 		int i;
1890 
1891 		if ((unsigned)wr->num_sge > srq->rq.max_sge) {
1892 			*bad_wr = wr;
1893 			return -EINVAL;
1894 		}
1895 
1896 		spin_lock_irqsave(&srq->rq.lock, flags);
1897 		wq = srq->rq.wq;
1898 		next = wq->head + 1;
1899 		if (next >= srq->rq.size)
1900 			next = 0;
1901 		if (next == wq->tail) {
1902 			spin_unlock_irqrestore(&srq->rq.lock, flags);
1903 			*bad_wr = wr;
1904 			return -ENOMEM;
1905 		}
1906 
1907 		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
1908 		wqe->wr_id = wr->wr_id;
1909 		wqe->num_sge = wr->num_sge;
1910 		for (i = 0; i < wr->num_sge; i++)
1911 			wqe->sg_list[i] = wr->sg_list[i];
1912 		/* Make sure queue entry is written before the head index. */
1913 		smp_wmb();
1914 		wq->head = next;
1915 		spin_unlock_irqrestore(&srq->rq.lock, flags);
1916 	}
1917 	return 0;
1918 }
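
/*
 * Illustrative caller sketch, an assumption rather than code from this file:
 * posting one receive buffer to an SRQ through the verbs API, which the core
 * dispatches to rvt_post_srq_recv() for rdmavt-backed devices.  The helper
 * name and the wr_id value are hypothetical.
 */
static int example_post_one_srq_recv(struct ib_srq *srq, struct ib_sge *sge)
{
	struct ib_recv_wr wr = { }, *bad_wr;

	wr.wr_id = 2;		/* caller-chosen completion cookie */
	wr.sg_list = sge;
	wr.num_sge = 1;

	return ib_post_srq_recv(srq, &wr, &bad_wr);
}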
1919 
1920 /**
1921  * rvt_comm_est - handle trap with QP established
1922  * @qp: the QP
1923  */
1924 void rvt_comm_est(struct rvt_qp *qp)
1925 {
1926 	qp->r_flags |= RVT_R_COMM_EST;
1927 	if (qp->ibqp.event_handler) {
1928 		struct ib_event ev;
1929 
1930 		ev.device = qp->ibqp.device;
1931 		ev.element.qp = &qp->ibqp;
1932 		ev.event = IB_EVENT_COMM_EST;
1933 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1934 	}
1935 }
1936 EXPORT_SYMBOL(rvt_comm_est);
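
/*
 * Illustrative sketch, an assumption rather than code from this file: a
 * consumer's QP event handler receiving the IB_EVENT_COMM_EST event that
 * rvt_comm_est() raises above.  The handler name is hypothetical.
 */
static void example_qp_event_handler(struct ib_event *ev, void *qp_context)
{
	if (ev->event == IB_EVENT_COMM_EST)
		pr_info("QP %u: communication established\n",
			ev->element.qp->qp_num);
}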
1937 
1938 void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
1939 {
1940 	unsigned long flags;
1941 	int lastwqe;
1942 
1943 	spin_lock_irqsave(&qp->s_lock, flags);
1944 	lastwqe = rvt_error_qp(qp, err);
1945 	spin_unlock_irqrestore(&qp->s_lock, flags);
1946 
1947 	if (lastwqe) {
1948 		struct ib_event ev;
1949 
1950 		ev.device = qp->ibqp.device;
1951 		ev.element.qp = &qp->ibqp;
1952 		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1953 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1954 	}
1955 }
1956 EXPORT_SYMBOL(rvt_rc_error);
1957 
1958 /*
1959  *  rvt_rnr_tbl_to_usec - convert an index into ib_rvt_rnr_table to usec
1960  *  @index - the index
1961  *  return usec from an index into ib_rvt_rnr_table
1962  */
1963 unsigned long rvt_rnr_tbl_to_usec(u32 index)
1964 {
1965 	return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
1966 }
1967 EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
1968 
1969 static inline unsigned long rvt_aeth_to_usec(u32 aeth)
1970 {
1971 	return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
1972 				  IB_AETH_CREDIT_MASK];
1973 }
1974 
1975 /*
1976  *  rvt_add_retry_timer - add/start a retry timer
1977  *  @qp - the QP
1978  *  add a retry timer on the QP
1979  */
1980 void rvt_add_retry_timer(struct rvt_qp *qp)
1981 {
1982 	struct ib_qp *ibqp = &qp->ibqp;
1983 	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1984 
1985 	lockdep_assert_held(&qp->s_lock);
1986 	qp->s_flags |= RVT_S_TIMER;
1987 	/* 4.096 usec. * (1 << qp->timeout) */
1988 	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
1989 			     rdi->busy_jiffies;
1990 	add_timer(&qp->s_timer);
1991 }
1992 EXPORT_SYMBOL(rvt_add_retry_timer);
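
/*
 * Illustrative sketch, an assumption based only on the comment above: the
 * IBTA local ACK timeout is 4.096 usec * 2^timeout, so a timeout_jiffies
 * value consistent with rvt_add_retry_timer() can be derived from the 5-bit
 * timeout code as below.  The helper name is hypothetical.
 */
static inline unsigned long example_timeout_jiffies(u8 timeout)
{
	return usecs_to_jiffies((4096UL * (1UL << timeout)) / 1000UL);
}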
1993 
1994 /**
1995  * rvt_add_rnr_timer - add/start an rnr timer
1996  * @qp: the QP
1997  * @aeth: aeth of RNR timeout, simulated aeth for loopback
1998  * add an rnr timer on the QP
1999  */
2000 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
2001 {
2002 	u32 to;
2003 
2004 	lockdep_assert_held(&qp->s_lock);
2005 	qp->s_flags |= RVT_S_WAIT_RNR;
2006 	to = rvt_aeth_to_usec(aeth);
2007 	hrtimer_start(&qp->s_rnr_timer,
2008 		      ns_to_ktime(1000 * to), HRTIMER_MODE_REL);
2009 }
2010 EXPORT_SYMBOL(rvt_add_rnr_timer);
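
/*
 * Illustrative sketch, an assumption rather than code from this file: a
 * loopback path that has no real AETH can simulate one from a 5-bit RNR
 * timer code and hand it to rvt_add_rnr_timer(); e.g. code 0x01 arms the
 * timer for 10 usec per ib_rvt_rnr_table.  The helper name is hypothetical.
 */
static void example_arm_rnr_timer(struct rvt_qp *qp, u8 timer_code)
{
	u32 aeth = (timer_code & IB_AETH_CREDIT_MASK) << IB_AETH_CREDIT_SHIFT;

	lockdep_assert_held(&qp->s_lock);	/* rvt_add_rnr_timer() requires it */
	rvt_add_rnr_timer(qp, aeth);
}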
2011 
2012 /**
2013  * rvt_stop_rc_timers - stop all timers
2014  * @qp: the QP
2015  * stop any pending timers
2016  */
2017 void rvt_stop_rc_timers(struct rvt_qp *qp)
2018 {
2019 	lockdep_assert_held(&qp->s_lock);
2020 	/* Remove QP from all timers */
2021 	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
2022 		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
2023 		del_timer(&qp->s_timer);
2024 		hrtimer_try_to_cancel(&qp->s_rnr_timer);
2025 	}
2026 }
2027 EXPORT_SYMBOL(rvt_stop_rc_timers);
2028 
2029 /**
2030  * rvt_stop_rnr_timer - stop an rnr timer
2031  * @qp: the QP
2032  *
2033  * Stop an rnr timer and return whether the timer
2034  * was pending.
2035  */
2036 static int rvt_stop_rnr_timer(struct rvt_qp *qp)
2037 {
2038 	int rval = 0;
2039 
2040 	lockdep_assert_held(&qp->s_lock);
2041 	/* Remove QP from rnr timer */
2042 	if (qp->s_flags & RVT_S_WAIT_RNR) {
2043 		qp->s_flags &= ~RVT_S_WAIT_RNR;
2044 		rval = hrtimer_try_to_cancel(&qp->s_rnr_timer);
2045 	}
2046 	return rval;
2047 }
2048 
2049 /**
2050  * rvt_del_timers_sync - wait for any timeout routines to exit
2051  * @qp: the QP
2052  */
2053 void rvt_del_timers_sync(struct rvt_qp *qp)
2054 {
2055 	del_timer_sync(&qp->s_timer);
2056 	hrtimer_cancel(&qp->s_rnr_timer);
2057 }
2058 EXPORT_SYMBOL(rvt_del_timers_sync);
2059 
2060 /*
2061  * This is called from s_timer for missing responses.
2062  */
2063 static void rvt_rc_timeout(unsigned long arg)
2064 {
2065 	struct rvt_qp *qp = (struct rvt_qp *)arg;
2066 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2067 	unsigned long flags;
2068 
2069 	spin_lock_irqsave(&qp->r_lock, flags);
2070 	spin_lock(&qp->s_lock);
2071 	if (qp->s_flags & RVT_S_TIMER) {
2072 		qp->s_flags &= ~RVT_S_TIMER;
2073 		del_timer(&qp->s_timer);
2074 		if (rdi->driver_f.notify_restart_rc)
2075 			rdi->driver_f.notify_restart_rc(qp,
2076 							qp->s_last_psn + 1,
2077 							1);
2078 		rdi->driver_f.schedule_send(qp);
2079 	}
2080 	spin_unlock(&qp->s_lock);
2081 	spin_unlock_irqrestore(&qp->r_lock, flags);
2082 }
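
/*
 * Illustrative sketch, an assumption rather than code from this file: with
 * the timer API implied by rvt_rc_timeout()'s (unsigned long) argument, the
 * QP's s_timer would be bound to the handler roughly as below so the timer
 * callback receives the QP.  The helper name is hypothetical.
 */
static inline void example_init_rc_timer(struct rvt_qp *qp)
{
	setup_timer(&qp->s_timer, rvt_rc_timeout, (unsigned long)qp);
}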
2083 
2084 /*
2085  * This is called from s_rnr_timer when an RNR timeout expires.
2086  */
2087 enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
2088 {
2089 	struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
2090 	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
2091 	unsigned long flags;
2092 
2093 	spin_lock_irqsave(&qp->s_lock, flags);
2094 	rvt_stop_rnr_timer(qp);
2095 	rdi->driver_f.schedule_send(qp);
2096 	spin_unlock_irqrestore(&qp->s_lock, flags);
2097 	return HRTIMER_NORESTART;
2098 }
2099 EXPORT_SYMBOL(rvt_rc_rnr_retry);
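
/*
 * Illustrative sketch, an assumption rather than code from this file: the
 * RNR hrtimer is bound to rvt_rc_rnr_retry() roughly as below so that
 * rvt_add_rnr_timer() can later arm it.  The helper name is hypothetical.
 */
static inline void example_init_rnr_timer(struct rvt_qp *qp)
{
	hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	qp->s_rnr_timer.function = rvt_rc_rnr_retry;
}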
2100