1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/llist.h>
11 #include <asm/barrier.h>
12 #include <net/tcp.h>
13 
14 #include "siw.h"
15 #include "siw_verbs.h"
16 #include "siw_mem.h"
17 
18 static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
19 	[SIW_QP_STATE_IDLE] = "IDLE",
20 	[SIW_QP_STATE_RTR] = "RTR",
21 	[SIW_QP_STATE_RTS] = "RTS",
22 	[SIW_QP_STATE_CLOSING] = "CLOSING",
23 	[SIW_QP_STATE_TERMINATE] = "TERMINATE",
24 	[SIW_QP_STATE_ERROR] = "ERROR"
25 };
26 
27 /*
28  * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
29  * per-RDMAP message basis. Please keep the order of initializers. All MPA
30  * lengths are initialized to the minimum packet size.
31  */
32 struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
33 	{ /* RDMAP_RDMA_WRITE */
34 	  .hdr_len = sizeof(struct iwarp_rdma_write),
35 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
36 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
37 				 cpu_to_be16(DDP_VERSION << 8) |
38 				 cpu_to_be16(RDMAP_VERSION << 6) |
39 				 cpu_to_be16(RDMAP_RDMA_WRITE),
40 	  .rx_data = siw_proc_write },
41 	{ /* RDMAP_RDMA_READ_REQ */
42 	  .hdr_len = sizeof(struct iwarp_rdma_rreq),
43 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
44 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
45 				 cpu_to_be16(RDMAP_VERSION << 6) |
46 				 cpu_to_be16(RDMAP_RDMA_READ_REQ),
47 	  .rx_data = siw_proc_rreq },
48 	{ /* RDMAP_RDMA_READ_RESP */
49 	  .hdr_len = sizeof(struct iwarp_rdma_rresp),
50 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
51 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
52 				 cpu_to_be16(DDP_VERSION << 8) |
53 				 cpu_to_be16(RDMAP_VERSION << 6) |
54 				 cpu_to_be16(RDMAP_RDMA_READ_RESP),
55 	  .rx_data = siw_proc_rresp },
56 	{ /* RDMAP_SEND */
57 	  .hdr_len = sizeof(struct iwarp_send),
58 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
59 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
60 				 cpu_to_be16(RDMAP_VERSION << 6) |
61 				 cpu_to_be16(RDMAP_SEND),
62 	  .rx_data = siw_proc_send },
63 	{ /* RDMAP_SEND_INVAL */
64 	  .hdr_len = sizeof(struct iwarp_send_inv),
65 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
66 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
67 				 cpu_to_be16(RDMAP_VERSION << 6) |
68 				 cpu_to_be16(RDMAP_SEND_INVAL),
69 	  .rx_data = siw_proc_send },
70 	{ /* RDMAP_SEND_SE */
71 	  .hdr_len = sizeof(struct iwarp_send),
72 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
73 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
74 				 cpu_to_be16(RDMAP_VERSION << 6) |
75 				 cpu_to_be16(RDMAP_SEND_SE),
76 	  .rx_data = siw_proc_send },
77 	{ /* RDMAP_SEND_SE_INVAL */
78 	  .hdr_len = sizeof(struct iwarp_send_inv),
79 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
80 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
81 				 cpu_to_be16(RDMAP_VERSION << 6) |
82 				 cpu_to_be16(RDMAP_SEND_SE_INVAL),
83 	  .rx_data = siw_proc_send },
84 	{ /* RDMAP_TERMINATE */
85 	  .hdr_len = sizeof(struct iwarp_terminate),
86 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
87 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
88 				 cpu_to_be16(RDMAP_VERSION << 6) |
89 				 cpu_to_be16(RDMAP_TERMINATE),
90 	  .rx_data = siw_proc_terminate }
91 };
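
/*
 * The table above is indexed by RDMAP opcode. A minimal sketch of how
 * the receive path (outside this file) presumably dispatches on the
 * opcode taken from the DDP/RDMAP control word (srx being the QP's
 * struct siw_rx_stream):
 *
 *	enum rdma_opcode op = __rdmap_get_opcode(&srx->hdr.ctrl);
 *	int rv = iwarp_pktinfo[op].rx_data(qp);
 *
 * Locally, hdr_len and the pre-cooked ctrl word are reused as well,
 * e.g. by siw_send_terminate() when reconstructing a peer header.
 */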
92 
93 void siw_qp_llp_data_ready(struct sock *sk)
94 {
95 	struct siw_qp *qp;
96 
97 	read_lock(&sk->sk_callback_lock);
98 
99 	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
100 		goto done;
101 
102 	qp = sk_to_qp(sk);
103 
104 	if (likely(!qp->rx_stream.rx_suspend &&
105 		   down_read_trylock(&qp->state_lock))) {
106 		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
107 
108 		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
109 			/*
110 			 * Implements the data receive operation in the
111 			 * socket callback. TCP gracefully handles the
112 			 * case where there is nothing to receive
113 			 * (siw_tcp_rx_data() is not called then).
114 			 */
115 			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
116 
117 		up_read(&qp->state_lock);
118 	} else {
119 		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
120 			   qp->rx_stream.rx_suspend);
121 	}
122 done:
123 	read_unlock(&sk->sk_callback_lock);
124 }
125 
126 void siw_qp_llp_close(struct siw_qp *qp)
127 {
128 	siw_dbg_qp(qp, "enter llp close, state = %s\n",
129 		   siw_qp_state_to_string[qp->attrs.state]);
130 
131 	down_write(&qp->state_lock);
132 
133 	qp->rx_stream.rx_suspend = 1;
134 	qp->tx_ctx.tx_suspend = 1;
135 	qp->attrs.sk = NULL;
136 
137 	switch (qp->attrs.state) {
138 	case SIW_QP_STATE_RTS:
139 	case SIW_QP_STATE_RTR:
140 	case SIW_QP_STATE_IDLE:
141 	case SIW_QP_STATE_TERMINATE:
142 		qp->attrs.state = SIW_QP_STATE_ERROR;
143 		break;
144 	/*
145 	 * SIW_QP_STATE_CLOSING:
146 	 *
147 	 * This is a forced close. Shall the QP be moved to
148 	 * ERROR or IDLE?
149 	 */
150 	case SIW_QP_STATE_CLOSING:
151 		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
152 			qp->attrs.state = SIW_QP_STATE_ERROR;
153 		else
154 			qp->attrs.state = SIW_QP_STATE_IDLE;
155 		break;
156 
157 	default:
158 		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
159 			   siw_qp_state_to_string[qp->attrs.state]);
160 		break;
161 	}
162 	siw_sq_flush(qp);
163 	siw_rq_flush(qp);
164 
165 	/*
166 	 * Dereference closing CEP
167 	 */
168 	if (qp->cep) {
169 		siw_cep_put(qp->cep);
170 		qp->cep = NULL;
171 	}
172 
173 	up_write(&qp->state_lock);
174 
175 	siw_dbg_qp(qp, "llp close exit: state %s\n",
176 		   siw_qp_state_to_string[qp->attrs.state]);
177 }
178 
179 /*
180  * socket callback routine informing about newly available send space.
181  * Function schedules SQ work for processing SQ items.
182  */
183 void siw_qp_llp_write_space(struct sock *sk)
184 {
185 	struct siw_cep *cep = sk_to_cep(sk);
186 
187 	cep->sk_write_space(sk);
188 
189 	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
190 		(void)siw_sq_start(cep->qp);
191 }
192 
193 static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
194 {
195 	irq_size = roundup_pow_of_two(irq_size);
196 	orq_size = roundup_pow_of_two(orq_size);
197 
198 	qp->attrs.irq_size = irq_size;
199 	qp->attrs.orq_size = orq_size;
200 
201 	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
202 	if (!qp->irq) {
203 		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
204 		qp->attrs.irq_size = 0;
205 		return -ENOMEM;
206 	}
207 	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
208 	if (!qp->orq) {
209 		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
210 		qp->attrs.orq_size = 0;
211 		qp->attrs.irq_size = 0;
212 		vfree(qp->irq);
213 		return -ENOMEM;
214 	}
215 	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
216 	return 0;
217 }
218 
219 static int siw_qp_enable_crc(struct siw_qp *qp)
220 {
221 	struct siw_rx_stream *c_rx = &qp->rx_stream;
222 	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
223 	int size = crypto_shash_descsize(siw_crypto_shash) +
224 			sizeof(struct shash_desc);
225 
226 	if (siw_crypto_shash == NULL)
227 		return -ENOENT;
228 
229 	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
230 	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
231 	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
232 		kfree(c_tx->mpa_crc_hd);
233 		kfree(c_rx->mpa_crc_hd);
234 		c_tx->mpa_crc_hd = NULL;
235 		c_rx->mpa_crc_hd = NULL;
236 		return -ENOMEM;
237 	}
238 	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
239 	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
240 
241 	return 0;
242 }
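
/*
 * Sketch of how the descriptors allocated above are used per MPA frame,
 * with data/len/crc as placeholders; see the CRC over the TERMINATE
 * iovec in siw_send_terminate() below for an in-file instance:
 *
 *	crypto_shash_init(c_tx->mpa_crc_hd);
 *	crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)data, len);
 *	crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&crc);
 */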
243 
244 /*
245  * Send a non-signalled READ or WRITE to the peer side as negotiated
246  * with the MPAv2 P2P setup protocol. The work request is only created
247  * as the current active WR and does not consume Send Queue space.
248  *
249  * Caller must hold the QP state lock.
250  */
251 int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
252 {
253 	struct siw_wqe *wqe = tx_wqe(qp);
254 	unsigned long flags;
255 	int rv = 0;
256 
257 	spin_lock_irqsave(&qp->sq_lock, flags);
258 
259 	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
260 		spin_unlock_irqrestore(&qp->sq_lock, flags);
261 		return -EIO;
262 	}
263 	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
264 
265 	wqe->wr_status = SIW_WR_QUEUED;
266 	wqe->sqe.flags = 0;
267 	wqe->sqe.num_sge = 1;
268 	wqe->sqe.sge[0].length = 0;
269 	wqe->sqe.sge[0].laddr = 0;
270 	wqe->sqe.sge[0].lkey = 0;
271 	/*
272 	 * While the STag must not be checked for an inbound zero length
273 	 * READ/WRITE, some HW may treat STag 0 specially.
274 	 */
275 	wqe->sqe.rkey = 1;
276 	wqe->sqe.raddr = 0;
277 	wqe->processed = 0;
278 
279 	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
280 		wqe->sqe.opcode = SIW_OP_WRITE;
281 	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
282 		struct siw_sqe *rreq;
283 
284 		wqe->sqe.opcode = SIW_OP_READ;
285 
286 		spin_lock(&qp->orq_lock);
287 
288 		rreq = orq_get_free(qp);
289 		if (rreq) {
290 			siw_read_to_orq(rreq, &wqe->sqe);
291 			qp->orq_put++;
292 		} else
293 			rv = -EIO;
294 
295 		spin_unlock(&qp->orq_lock);
296 	} else
297 		rv = -EINVAL;
298 
299 	if (rv)
300 		wqe->wr_status = SIW_WR_IDLE;
301 
302 	spin_unlock_irqrestore(&qp->sq_lock, flags);
303 
304 	if (!rv)
305 		rv = siw_sq_start(qp);
306 
307 	return rv;
308 }
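
/*
 * Usage sketch (the actual caller lives in the connection manager, not
 * in this file): after MPAv2 negotiation selected a zero length WRITE
 * as RTR indication, and with the QP state lock held:
 *
 *	rv = siw_qp_mpa_rts(qp, MPA_V2_RDMA_WRITE_RTR);
 */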
309 
310 /*
311  * Map memory access error to DDP tagged error
312  */
313 enum ddp_ecode siw_tagged_error(enum siw_access_state state)
314 {
315 	switch (state) {
316 	case E_STAG_INVALID:
317 		return DDP_ECODE_T_INVALID_STAG;
318 	case E_BASE_BOUNDS:
319 		return DDP_ECODE_T_BASE_BOUNDS;
320 	case E_PD_MISMATCH:
321 		return DDP_ECODE_T_STAG_NOT_ASSOC;
322 	case E_ACCESS_PERM:
323 		/*
324 		 * RFC 5041 (DDP) lacks an ecode for insufficient access
325 		 * permissions. 'Invalid STag' seems to be the closest
326 		 * match though.
327 		 */
328 		return DDP_ECODE_T_INVALID_STAG;
329 	default:
330 		WARN_ON(1);
331 		return DDP_ECODE_T_INVALID_STAG;
332 	}
333 }
334 
335 /*
336  * Map memory access error to RDMAP protection error
337  */
338 enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
339 {
340 	switch (state) {
341 	case E_STAG_INVALID:
342 		return RDMAP_ECODE_INVALID_STAG;
343 	case E_BASE_BOUNDS:
344 		return RDMAP_ECODE_BASE_BOUNDS;
345 	case E_PD_MISMATCH:
346 		return RDMAP_ECODE_STAG_NOT_ASSOC;
347 	case E_ACCESS_PERM:
348 		return RDMAP_ECODE_ACCESS_RIGHTS;
349 	default:
350 		return RDMAP_ECODE_UNSPECIFIED;
351 	}
352 }
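
/*
 * Both mappers above typically parameterize a TERMINATE, e.g. (sketch,
 * assuming a tagged buffer access fault was detected while placing
 * inbound data):
 *
 *	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_TAGGED_BUF,
 *			   siw_tagged_error(E_ACCESS_PERM), 0);
 */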
353 
354 void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
355 			u8 ecode, int in_tx)
356 {
357 	if (!qp->term_info.valid) {
358 		memset(&qp->term_info, 0, sizeof(qp->term_info));
359 		qp->term_info.layer = layer;
360 		qp->term_info.etype = etype;
361 		qp->term_info.ecode = ecode;
362 		qp->term_info.in_tx = in_tx;
363 		qp->term_info.valid = 1;
364 	}
365 	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
366 		   layer, etype, ecode, in_tx ? "yes" : "no");
367 }
368 
369 /*
370  * Send a TERMINATE message, as defined in RFCs 5040/5041/5044/6581.
371  * Sending TERMINATE messages is best effort - such messages
372  * can only be sent if the QP is still connected and does
373  * not have another outbound message in progress, i.e. the
374  * TERMINATE message must not interfere with an incomplete current
375  * transmit operation.
376  */
377 void siw_send_terminate(struct siw_qp *qp)
378 {
379 	struct kvec iov[3];
380 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
381 	struct iwarp_terminate *term = NULL;
382 	union iwarp_hdr *err_hdr = NULL;
383 	struct socket *s = qp->attrs.sk;
384 	struct siw_rx_stream *srx = &qp->rx_stream;
385 	union iwarp_hdr *rx_hdr = &srx->hdr;
386 	u32 crc = 0;
387 	int num_frags, len_terminate, rv;
388 
389 	if (!qp->term_info.valid)
390 		return;
391 
392 	qp->term_info.valid = 0;
393 
394 	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
395 		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
396 			   tx_type(tx_wqe(qp)));
397 		return;
398 	}
399 	if (!s && qp->cep)
400 		/* QP not yet in RTS. Take socket from connection endpoint */
401 		s = qp->cep->sock;
402 
403 	if (!s) {
404 		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
405 		return;
406 	}
407 
408 	term = kzalloc(sizeof(*term), GFP_KERNEL);
409 	if (!term)
410 		return;
411 
412 	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
413 	term->ddp_mo = 0;
414 	term->ddp_msn = cpu_to_be32(1);
415 
416 	iov[0].iov_base = term;
417 	iov[0].iov_len = sizeof(*term);
418 
419 	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
420 	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
421 	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
422 		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
423 		if (!err_hdr) {
424 			kfree(term);
425 			return;
426 		}
427 	}
428 	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
429 	       sizeof(struct iwarp_ctrl));
430 
431 	__rdmap_term_set_layer(term, qp->term_info.layer);
432 	__rdmap_term_set_etype(term, qp->term_info.etype);
433 	__rdmap_term_set_ecode(term, qp->term_info.ecode);
434 
435 	switch (qp->term_info.layer) {
436 	case TERM_ERROR_LAYER_RDMAP:
437 		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
438 			/* No additional DDP/RDMAP header to be included */
439 			break;
440 
441 		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
442 			/*
443 			 * Complete RDMAP frame will get attached, and
444 			 * DDP segment length is valid
445 			 */
446 			term->flag_m = 1;
447 			term->flag_d = 1;
448 			term->flag_r = 1;
449 
450 			if (qp->term_info.in_tx) {
451 				struct iwarp_rdma_rreq *rreq;
452 				struct siw_wqe *wqe = tx_wqe(qp);
453 
454 				/* Inbound RREQ error, detected during
455 				 * RRESP creation. Take state from the
456 				 * current TX work queue element to
457 				 * reconstruct the peer's RREQ.
458 				 */
459 				rreq = (struct iwarp_rdma_rreq *)err_hdr;
460 
461 				memcpy(&rreq->ctrl,
462 				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
463 				       sizeof(struct iwarp_ctrl));
464 
465 				rreq->rsvd = 0;
466 				rreq->ddp_qn =
467 					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
468 
469 				/* Provide RREQ's MSN as kept aside */
470 				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
471 
472 				rreq->ddp_mo = htonl(wqe->processed);
473 				rreq->sink_stag = htonl(wqe->sqe.rkey);
474 				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
475 				rreq->read_size = htonl(wqe->sqe.sge[0].length);
476 				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
477 				rreq->source_to =
478 					cpu_to_be64(wqe->sqe.sge[0].laddr);
479 
480 				iov[1].iov_base = rreq;
481 				iov[1].iov_len = sizeof(*rreq);
482 
483 				rx_hdr = (union iwarp_hdr *)rreq;
484 			} else {
485 				/* Take RDMAP/DDP information from
486 				 * current (failed) inbound frame.
487 				 */
488 				iov[1].iov_base = rx_hdr;
489 
490 				if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
491 				    RDMAP_RDMA_READ_REQ)
492 					iov[1].iov_len =
493 						sizeof(struct iwarp_rdma_rreq);
494 				else /* SEND type */
495 					iov[1].iov_len =
496 						sizeof(struct iwarp_send);
497 			}
498 		} else {
499 			/* Do not report DDP hdr information if packet
500 			 * layout is unknown
501 			 */
502 			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
503 			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
504 				break;
505 
506 			iov[1].iov_base = rx_hdr;
507 
508 			/* Only DDP frame will get attached */
509 			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
510 				iov[1].iov_len =
511 					sizeof(struct iwarp_rdma_write);
512 			else
513 				iov[1].iov_len = sizeof(struct iwarp_send);
514 
515 			term->flag_m = 1;
516 			term->flag_d = 1;
517 		}
518 		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
519 		break;
520 
521 	case TERM_ERROR_LAYER_DDP:
522 		/* Report an error encountered during DDP processing.
523 		 * This can only happen as a result of inbound
524 		 * DDP processing.
525 		 */
526 
527 		/* Do not report DDP hdr information if packet
528 		 * layout is unknown
529 		 */
530 		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
531 		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
532 		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
533 		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
534 			break;
535 
536 		iov[1].iov_base = rx_hdr;
537 
538 		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
539 			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
540 		else
541 			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
542 
543 		term->flag_m = 1;
544 		term->flag_d = 1;
545 		break;
546 
547 	default:
548 		break;
549 	}
550 	if (term->flag_m || term->flag_d || term->flag_r) {
551 		iov[2].iov_base = &crc;
552 		iov[2].iov_len = sizeof(crc);
553 		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
554 		num_frags = 3;
555 	} else {
556 		iov[1].iov_base = &crc;
557 		iov[1].iov_len = sizeof(crc);
558 		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
559 		num_frags = 2;
560 	}
561 
562 	/* Adjust DDP Segment Length parameter, if valid */
563 	if (term->flag_m) {
564 		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
565 		enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
566 
567 		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
568 		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
569 	}
570 
571 	term->ctrl.mpa_len =
572 		cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
573 	if (qp->tx_ctx.mpa_crc_hd) {
574 		crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
575 		if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
576 					(u8 *)iov[0].iov_base,
577 					iov[0].iov_len))
578 			goto out;
579 
580 		if (num_frags == 3) {
581 			if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
582 						(u8 *)iov[1].iov_base,
583 						iov[1].iov_len))
584 				goto out;
585 		}
586 		crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
587 	}
588 
589 	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
590 	siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
591 		   rv == len_terminate ? "success" : "failure",
592 		   __rdmap_term_layer(term), __rdmap_term_etype(term),
593 		   __rdmap_term_ecode(term), rv);
594 out:
595 	kfree(term);
596 	kfree(err_hdr);
597 }
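
/*
 * Resulting TERMINATE wire layout, as assembled above: the terminate
 * header (iov[0]), optionally a copy of the offending DDP/RDMAP header
 * (iov[1], announced via flag_m/flag_d/flag_r), and the trailing MPA
 * CRC. MPA header and CRC sizes are excluded from the final mpa_len.
 */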
598 
599 /*
600  * Handle all attrs other than state
601  */
602 static void siw_qp_modify_nonstate(struct siw_qp *qp,
603 				   struct siw_qp_attrs *attrs,
604 				   enum siw_qp_attr_mask mask)
605 {
606 	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
607 		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
608 			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
609 		else
610 			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
611 
612 		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
613 			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
614 		else
615 			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
616 
617 		if (attrs->flags & SIW_RDMA_READ_ENABLED)
618 			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
619 		else
620 			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
621 	}
622 }
623 
624 static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
625 				      struct siw_qp_attrs *attrs,
626 				      enum siw_qp_attr_mask mask)
627 {
628 	int rv = 0;
629 
630 	switch (attrs->state) {
631 	case SIW_QP_STATE_RTS:
632 		if (attrs->flags & SIW_MPA_CRC) {
633 			rv = siw_qp_enable_crc(qp);
634 			if (rv)
635 				break;
636 		}
637 		if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
638 			siw_dbg_qp(qp, "no socket\n");
639 			rv = -EINVAL;
640 			break;
641 		}
642 		if (!(mask & SIW_QP_ATTR_MPA)) {
643 			siw_dbg_qp(qp, "no MPA\n");
644 			rv = -EINVAL;
645 			break;
646 		}
647 		/*
648 		 * Initialize iWARP TX state
649 		 */
650 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
651 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
652 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
653 
654 		/*
655 		 * Initialize iWARP RX state
656 		 */
657 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
658 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
659 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
660 
661 		/*
662 		 * Init the inbound and outbound read queues; the caller
663 		 * has already checked the limits.
664 		 */
665 		rv = siw_qp_readq_init(qp, attrs->irq_size,
666 				       attrs->orq_size);
667 		if (rv)
668 			break;
669 
670 		qp->attrs.sk = attrs->sk;
671 		qp->attrs.state = SIW_QP_STATE_RTS;
672 
673 		siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
674 			   attrs->flags & SIW_MPA_CRC ? "y" : "n",
675 			   qp->attrs.orq_size, qp->attrs.irq_size);
676 		break;
677 
678 	case SIW_QP_STATE_ERROR:
679 		siw_rq_flush(qp);
680 		qp->attrs.state = SIW_QP_STATE_ERROR;
681 		if (qp->cep) {
682 			siw_cep_put(qp->cep);
683 			qp->cep = NULL;
684 		}
685 		break;
686 
687 	default:
688 		break;
689 	}
690 	return rv;
691 }
692 
693 static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
694 				     struct siw_qp_attrs *attrs)
695 {
696 	int drop_conn = 0;
697 
698 	switch (attrs->state) {
699 	case SIW_QP_STATE_CLOSING:
700 		/*
701 		 * Verbs: move to IDLE if SQ and ORQ are empty.
702 		 * Move to ERROR otherwise. But first of all we must
703 		 * close the connection. So we keep CLOSING or ERROR
704 		 * as a transient state, schedule connection drop work
705 		 * and wait for the socket state change upcall to
706 		 * come back closed.
707 		 */
708 		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
709 			qp->attrs.state = SIW_QP_STATE_CLOSING;
710 		} else {
711 			qp->attrs.state = SIW_QP_STATE_ERROR;
712 			siw_sq_flush(qp);
713 		}
714 		siw_rq_flush(qp);
715 
716 		drop_conn = 1;
717 		break;
718 
719 	case SIW_QP_STATE_TERMINATE:
720 		qp->attrs.state = SIW_QP_STATE_TERMINATE;
721 
722 		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
723 				   RDMAP_ETYPE_CATASTROPHIC,
724 				   RDMAP_ECODE_UNSPECIFIED, 1);
725 		drop_conn = 1;
726 		break;
727 
728 	case SIW_QP_STATE_ERROR:
729 		/*
730 		 * This is an emergency close.
731 		 *
732 		 * Any in-progress transmit operation will get
733 		 * cancelled.
734 		 * This will likely result in a protocol failure,
735 		 * if a TX operation is in transit. The caller
736 		 * could unconditionally wait to give the current
737 		 * operation a chance to complete.
738 		 * Especially, how to handle the non-empty IRQ case?
739 		 * The peer was asking for data transfer at a valid
740 		 * point in time.
741 		 */
742 		siw_sq_flush(qp);
743 		siw_rq_flush(qp);
744 		qp->attrs.state = SIW_QP_STATE_ERROR;
745 		drop_conn = 1;
746 		break;
747 
748 	default:
749 		break;
750 	}
751 	return drop_conn;
752 }
753 
754 static void siw_qp_nextstate_from_term(struct siw_qp *qp,
755 				       struct siw_qp_attrs *attrs)
756 {
757 	switch (attrs->state) {
758 	case SIW_QP_STATE_ERROR:
759 		siw_rq_flush(qp);
760 		qp->attrs.state = SIW_QP_STATE_ERROR;
761 
762 		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
763 			siw_sq_flush(qp);
764 		break;
765 
766 	default:
767 		break;
768 	}
769 }
770 
771 static int siw_qp_nextstate_from_close(struct siw_qp *qp,
772 				       struct siw_qp_attrs *attrs)
773 {
774 	int rv = 0;
775 
776 	switch (attrs->state) {
777 	case SIW_QP_STATE_IDLE:
778 		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
779 		qp->attrs.state = SIW_QP_STATE_IDLE;
780 		break;
781 
782 	case SIW_QP_STATE_CLOSING:
783 		/*
784 		 * The LLP may have already moved the QP to CLOSING
785 		 * due to a graceful peer close init.
786 		 */
787 		break;
788 
789 	case SIW_QP_STATE_ERROR:
790 		/*
791 		 * QP was moved to CLOSING by an LLP event
792 		 * not yet seen by the user.
793 		 */
794 		qp->attrs.state = SIW_QP_STATE_ERROR;
795 
796 		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
797 			siw_sq_flush(qp);
798 
799 		siw_rq_flush(qp);
800 		break;
801 
802 	default:
803 		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
804 			   siw_qp_state_to_string[qp->attrs.state],
805 			   siw_qp_state_to_string[attrs->state]);
806 
807 		rv = -ECONNABORTED;
808 	}
809 	return rv;
810 }
811 
812 /*
813  * Caller must hold qp->state_lock
814  */
815 int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
816 		  enum siw_qp_attr_mask mask)
817 {
818 	int drop_conn = 0, rv = 0;
819 
820 	if (!mask)
821 		return 0;
822 
823 	siw_dbg_qp(qp, "state: %s => %s\n",
824 		   siw_qp_state_to_string[qp->attrs.state],
825 		   siw_qp_state_to_string[attrs->state]);
826 
827 	if (mask != SIW_QP_ATTR_STATE)
828 		siw_qp_modify_nonstate(qp, attrs, mask);
829 
830 	if (!(mask & SIW_QP_ATTR_STATE))
831 		return 0;
832 
833 	switch (qp->attrs.state) {
834 	case SIW_QP_STATE_IDLE:
835 	case SIW_QP_STATE_RTR:
836 		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
837 		break;
838 
839 	case SIW_QP_STATE_RTS:
840 		drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
841 		break;
842 
843 	case SIW_QP_STATE_TERMINATE:
844 		siw_qp_nextstate_from_term(qp, attrs);
845 		break;
846 
847 	case SIW_QP_STATE_CLOSING:
848 		siw_qp_nextstate_from_close(qp, attrs);
849 		break;
850 	default:
851 		break;
852 	}
853 	if (drop_conn)
854 		siw_qp_cm_drop(qp, 0);
855 
856 	return rv;
857 }
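
/*
 * Usage sketch (not taken from this file): forcing a QP into ERROR
 * state with only the state attribute set:
 *
 *	struct siw_qp_attrs qp_attrs = { .state = SIW_QP_STATE_ERROR };
 *
 *	down_write(&qp->state_lock);
 *	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
 *	up_write(&qp->state_lock);
 */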
858 
859 void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
860 {
861 	rreq->id = sqe->id;
862 	rreq->opcode = sqe->opcode;
863 	rreq->sge[0].laddr = sqe->sge[0].laddr;
864 	rreq->sge[0].length = sqe->sge[0].length;
865 	rreq->sge[0].lkey = sqe->sge[0].lkey;
866 	rreq->sge[1].lkey = sqe->sge[1].lkey;
867 	rreq->flags = sqe->flags | SIW_WQE_VALID;
868 	rreq->num_sge = 1;
869 }
870 
871 /*
872  * Must be called with SQ locked.
873  * To avoid complete SQ starvation by a constant inbound READ request
874  * stream, the active IRQ is skipped in favor of the SQ once qp->irq_burst
875  * reaches SIW_IRQ_MAXBURST_SQ_ACTIVE while the SQ has pending work.
876  */
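/*
 * Return value convention, as implemented below: 1 if a new TX WQE was
 * activated and may be transmitted, 0 if there is nothing to transmit
 * or transmission is fenced on the ORQ, negative errno on an invalid
 * work request.
 */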
877 int siw_activate_tx(struct siw_qp *qp)
878 {
879 	struct siw_sqe *irqe, *sqe;
880 	struct siw_wqe *wqe = tx_wqe(qp);
881 	int rv = 1;
882 
883 	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
884 
885 	if (irqe->flags & SIW_WQE_VALID) {
886 		sqe = sq_get_next(qp);
887 
888 		/*
889 		 * Avoid local WQE processing starvation in case
890 		 * of a constant inbound READ request stream
891 		 */
892 		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
893 			qp->irq_burst = 0;
894 			goto skip_irq;
895 		}
896 		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
897 		wqe->wr_status = SIW_WR_QUEUED;
898 
899 		/* start READ RESPONSE */
900 		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
901 		wqe->sqe.flags = 0;
902 		if (irqe->num_sge) {
903 			wqe->sqe.num_sge = 1;
904 			wqe->sqe.sge[0].length = irqe->sge[0].length;
905 			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
906 			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
907 		} else {
908 			wqe->sqe.num_sge = 0;
909 		}
910 
911 		/* Retain original RREQ's message sequence number for
912 		 * potential error reporting cases.
913 		 */
914 		wqe->sqe.sge[1].length = irqe->sge[1].length;
915 
916 		wqe->sqe.rkey = irqe->rkey;
917 		wqe->sqe.raddr = irqe->raddr;
918 
919 		wqe->processed = 0;
920 		qp->irq_get++;
921 
922 		/* mark current IRQ entry free */
923 		smp_store_mb(irqe->flags, 0);
924 
925 		goto out;
926 	}
927 	sqe = sq_get_next(qp);
928 	if (sqe) {
929 skip_irq:
930 		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
931 		wqe->wr_status = SIW_WR_QUEUED;
932 
933 		/* First copy SQE to kernel private memory */
934 		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
935 
936 		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
937 			rv = -EINVAL;
938 			goto out;
939 		}
940 		if (wqe->sqe.flags & SIW_WQE_INLINE) {
941 			if (wqe->sqe.opcode != SIW_OP_SEND &&
942 			    wqe->sqe.opcode != SIW_OP_WRITE) {
943 				rv = -EINVAL;
944 				goto out;
945 			}
946 			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
947 				rv = -EINVAL;
948 				goto out;
949 			}
950 			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
951 			wqe->sqe.sge[0].lkey = 0;
952 			wqe->sqe.num_sge = 1;
953 		}
954 		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
955 			/* A READ cannot be fenced */
956 			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
957 				     wqe->sqe.opcode ==
958 					     SIW_OP_READ_LOCAL_INV)) {
959 				siw_dbg_qp(qp, "cannot fence read\n");
960 				rv = -EINVAL;
961 				goto out;
962 			}
963 			spin_lock(&qp->orq_lock);
964 
965 			if (!siw_orq_empty(qp)) {
966 				qp->tx_ctx.orq_fence = 1;
967 				rv = 0;
968 			}
969 			spin_unlock(&qp->orq_lock);
970 
971 		} else if (wqe->sqe.opcode == SIW_OP_READ ||
972 			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
973 			struct siw_sqe *rreq;
974 
975 			wqe->sqe.num_sge = 1;
976 
977 			spin_lock(&qp->orq_lock);
978 
979 			rreq = orq_get_free(qp);
980 			if (rreq) {
981 				/*
982 				 * Make an immediate copy in ORQ to be ready
983 				 * to process loopback READ reply
984 				 */
985 				siw_read_to_orq(rreq, &wqe->sqe);
986 				qp->orq_put++;
987 			} else {
988 				qp->tx_ctx.orq_fence = 1;
989 				rv = 0;
990 			}
991 			spin_unlock(&qp->orq_lock);
992 		}
993 
994 		/* Clear SQE, can be re-used by application */
995 		smp_store_mb(sqe->flags, 0);
996 		qp->sq_get++;
997 	} else {
998 		rv = 0;
999 	}
1000 out:
1001 	if (unlikely(rv < 0)) {
1002 		siw_dbg_qp(qp, "error %d\n", rv);
1003 		wqe->wr_status = SIW_WR_IDLE;
1004 	}
1005 	return rv;
1006 }
1007 
1008 /*
1009  * Check if current CQ state qualifies for calling CQ completion
1010  * handler. Must be called with CQ lock held.
1011  */
1012 static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
1013 {
1014 	u64 cq_notify;
1015 
1016 	if (!cq->base_cq.comp_handler)
1017 		return false;
1018 
1019 	cq_notify = READ_ONCE(*cq->notify);
1020 
1021 	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
1022 	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
1023 	     (flags & SIW_WQE_SOLICITED))) {
1024 		/* dis-arm CQ */
1025 		smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
1026 
1027 		return true;
1028 	}
1029 	return false;
1030 }
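
/*
 * Note: *cq->notify is presumably armed by the verbs layer (outside
 * this file) with SIW_NOTIFY_NEXT_COMPLETION or SIW_NOTIFY_SOLICITED;
 * the helper above consumes that request and dis-arms the CQ via
 * smp_store_mb() before the caller invokes the completion handler.
 */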
1031 
1032 int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
1033 		     enum siw_wc_status status)
1034 {
1035 	struct siw_cq *cq = qp->scq;
1036 	int rv = 0;
1037 
1038 	if (cq) {
1039 		u32 sqe_flags = sqe->flags;
1040 		struct siw_cqe *cqe;
1041 		u32 idx;
1042 		unsigned long flags;
1043 
1044 		spin_lock_irqsave(&cq->lock, flags);
1045 
1046 		idx = cq->cq_put % cq->num_cqe;
1047 		cqe = &cq->queue[idx];
1048 
1049 		if (!READ_ONCE(cqe->flags)) {
1050 			bool notify;
1051 
1052 			cqe->id = sqe->id;
1053 			cqe->opcode = sqe->opcode;
1054 			cqe->status = status;
1055 			cqe->imm_data = 0;
1056 			cqe->bytes = bytes;
1057 
1058 			if (cq->kernel_verbs)
1059 				cqe->base_qp = qp->ib_qp;
1060 			else
1061 				cqe->qp_id = qp_id(qp);
1062 
1063 			/* mark CQE valid for application */
1064 			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
1065 			/* recycle SQE */
1066 			smp_store_mb(sqe->flags, 0);
1067 
1068 			cq->cq_put++;
1069 			notify = siw_cq_notify_now(cq, sqe_flags);
1070 
1071 			spin_unlock_irqrestore(&cq->lock, flags);
1072 
1073 			if (notify) {
1074 				siw_dbg_cq(cq, "Call completion handler\n");
1075 				cq->base_cq.comp_handler(&cq->base_cq,
1076 						cq->base_cq.cq_context);
1077 			}
1078 		} else {
1079 			spin_unlock_irqrestore(&cq->lock, flags);
1080 			rv = -ENOMEM;
1081 			siw_cq_event(cq, IB_EVENT_CQ_ERR);
1082 		}
1083 	} else {
1084 		/* recycle SQE */
1085 		smp_store_mb(sqe->flags, 0);
1086 	}
1087 	return rv;
1088 }
1089 
1090 int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
1091 		     u32 inval_stag, enum siw_wc_status status)
1092 {
1093 	struct siw_cq *cq = qp->rcq;
1094 	int rv = 0;
1095 
1096 	if (cq) {
1097 		struct siw_cqe *cqe;
1098 		u32 idx;
1099 		unsigned long flags;
1100 
1101 		spin_lock_irqsave(&cq->lock, flags);
1102 
1103 		idx = cq->cq_put % cq->num_cqe;
1104 		cqe = &cq->queue[idx];
1105 
1106 		if (!READ_ONCE(cqe->flags)) {
1107 			bool notify;
1108 			u8 cqe_flags = SIW_WQE_VALID;
1109 
1110 			cqe->id = rqe->id;
1111 			cqe->opcode = SIW_OP_RECEIVE;
1112 			cqe->status = status;
1113 			cqe->imm_data = 0;
1114 			cqe->bytes = bytes;
1115 
1116 			if (cq->kernel_verbs) {
1117 				cqe->base_qp = qp->ib_qp;
1118 				if (inval_stag) {
1119 					cqe_flags |= SIW_WQE_REM_INVAL;
1120 					cqe->inval_stag = inval_stag;
1121 				}
1122 			} else {
1123 				cqe->qp_id = qp_id(qp);
1124 			}
1125 			/* mark CQE valid for application */
1126 			WRITE_ONCE(cqe->flags, cqe_flags);
1127 			/* recycle RQE */
1128 			smp_store_mb(rqe->flags, 0);
1129 
1130 			cq->cq_put++;
1131 			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
1132 
1133 			spin_unlock_irqrestore(&cq->lock, flags);
1134 
1135 			if (notify) {
1136 				siw_dbg_cq(cq, "Call completion handler\n");
1137 				cq->base_cq.comp_handler(&cq->base_cq,
1138 						cq->base_cq.cq_context);
1139 			}
1140 		} else {
1141 			spin_unlock_irqrestore(&cq->lock, flags);
1142 			rv = -ENOMEM;
1143 			siw_cq_event(cq, IB_EVENT_CQ_ERR);
1144 		}
1145 	} else {
1146 		/* recycle RQE */
1147 		smp_store_mb(rqe->flags, 0);
1148 	}
1149 	return rv;
1150 }
1151 
1152 /*
1153  * siw_sq_flush()
1154  *
1155  * Flush SQ and ORQ entries to the CQ.
1156  *
1157  * Must be called with the QP state write lock held.
1158  * Therefore, the SQ and ORQ locks must not be taken.
1159  */
1160 void siw_sq_flush(struct siw_qp *qp)
1161 {
1162 	struct siw_sqe *sqe;
1163 	struct siw_wqe *wqe = tx_wqe(qp);
1164 	int async_event = 0;
1165 
1166 	/*
1167 	 * Start with completing any work currently on the ORQ
1168 	 */
1169 	while (qp->attrs.orq_size) {
1170 		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
1171 		if (!READ_ONCE(sqe->flags))
1172 			break;
1173 
1174 		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1175 			break;
1176 
1177 		WRITE_ONCE(sqe->flags, 0);
1178 		qp->orq_get++;
1179 	}
1180 	/*
1181 	 * Flush an in-progress WQE if present
1182 	 */
1183 	if (wqe->wr_status != SIW_WR_IDLE) {
1184 		siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
1185 			   tx_type(wqe), wqe->wr_status);
1186 
1187 		siw_wqe_put_mem(wqe, tx_type(wqe));
1188 
1189 		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
1190 		    ((tx_type(wqe) != SIW_OP_READ &&
1191 		      tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
1192 		     wqe->wr_status == SIW_WR_QUEUED))
1193 			/*
1194 			 * An in-progress Read Request is already in
1195 			 * the ORQ
1196 			 */
1197 			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
1198 					 SIW_WC_WR_FLUSH_ERR);
1199 
1200 		wqe->wr_status = SIW_WR_IDLE;
1201 	}
1202 	/*
1203 	 * Flush the Send Queue
1204 	 */
1205 	while (qp->attrs.sq_size) {
1206 		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
1207 		if (!READ_ONCE(sqe->flags))
1208 			break;
1209 
1210 		async_event = 1;
1211 		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1212 			/*
1213 			 * Shall IB_EVENT_SQ_DRAINED be suppressed if work
1214 			 * completion fails?
1215 			 */
1216 			break;
1217 
1218 		WRITE_ONCE(sqe->flags, 0);
1219 		qp->sq_get++;
1220 	}
1221 	if (async_event)
1222 		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
1223 }
1224 
1225 /*
1226  * siw_rq_flush()
1227  *
1228  * Flush recv queue entries to CQ. Also
1229  * Flush receive queue entries to the CQ. Also
1230  * takes care of pending active tagged and untagged
1231  * inbound transfers, which have target memory
1232  * referenced.
1233  *
1234  * Must be called with the QP state write lock held.
1235  * Therefore, the RQ lock must not be taken.
1236 void siw_rq_flush(struct siw_qp *qp)
1237 {
1238 	struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
1239 
1240 	/*
1241 	 * Flush an in-progress untagged operation if present
1242 	 */
1243 	if (wqe->wr_status != SIW_WR_IDLE) {
1244 		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
1245 			   rx_type(wqe), wqe->wr_status);
1246 
1247 		siw_wqe_put_mem(wqe, rx_type(wqe));
1248 
1249 		if (rx_type(wqe) == SIW_OP_RECEIVE) {
1250 			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
1251 					 0, SIW_WC_WR_FLUSH_ERR);
1252 		} else if (rx_type(wqe) != SIW_OP_READ &&
1253 			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
1254 			   rx_type(wqe) != SIW_OP_WRITE) {
1255 			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
1256 		}
1257 		wqe->wr_status = SIW_WR_IDLE;
1258 	}
1259 	wqe = &qp->rx_tagged.wqe_active;
1260 
1261 	if (wqe->wr_status != SIW_WR_IDLE) {
1262 		siw_wqe_put_mem(wqe, rx_type(wqe));
1263 		wqe->wr_status = SIW_WR_IDLE;
1264 	}
1265 	/*
1266 	 * Flush the Receive Queue
1267 	 */
1268 	while (qp->attrs.rq_size) {
1269 		struct siw_rqe *rqe =
1270 			&qp->recvq[qp->rq_get % qp->attrs.rq_size];
1271 
1272 		if (!READ_ONCE(rqe->flags))
1273 			break;
1274 
1275 		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1276 			break;
1277 
1278 		WRITE_ONCE(rqe->flags, 0);
1279 		qp->rq_get++;
1280 	}
1281 }
1282 
1283 int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
1284 {
1285 	int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b,
1286 			  GFP_KERNEL);
1287 
1288 	if (!rv) {
1289 		kref_init(&qp->ref);
1290 		qp->sdev = sdev;
1291 		qp->qp_num = qp->ib_qp->qp_num;
1292 		siw_dbg_qp(qp, "new QP\n");
1293 	}
1294 	return rv;
1295 }
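
/*
 * Note: xa_alloc() above also assigns the QP number via
 * qp->ib_qp->qp_num, which qp_id() and the xa_erase() in siw_free_qp()
 * below presumably rely on for lookup and removal.
 */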
1296 
1297 void siw_free_qp(struct kref *ref)
1298 {
1299 	struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
1300 	struct siw_device *sdev = qp->sdev;
1301 	unsigned long flags;
1302 
1303 	if (qp->cep)
1304 		siw_cep_put(qp->cep);
1305 
1306 	found = xa_erase(&sdev->qp_xa, qp_id(qp));
1307 	WARN_ON(found != qp);
1308 	spin_lock_irqsave(&sdev->lock, flags);
1309 	list_del(&qp->devq);
1310 	spin_unlock_irqrestore(&sdev->lock, flags);
1311 
1312 	vfree(qp->sendq);
1313 	vfree(qp->recvq);
1314 	vfree(qp->irq);
1315 	vfree(qp->orq);
1316 
1317 	siw_put_tx_cpu(qp->tx_cpu);
1318 
1319 	atomic_dec(&sdev->num_qp);
1320 	siw_dbg_qp(qp, "free QP\n");
1321 	kfree_rcu(qp, rcu);
1322 }
1323